From 4a95d88ce2aa36452f4479a069b031ff2c1d4a24 Mon Sep 17 00:00:00 2001 From: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com> Date: Tue, 19 Aug 2025 11:44:36 +0800 Subject: [PATCH] revert tlg kernels for ease of merge Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com> --- .../batchedGemm/KernelRunner.cpp | 54 +- .../batchedGemm/KernelRunner.h | 30 +- .../BatchedGemmInterface.h | 25 - .../trtllmGen_bmm_export/BatchedGemmOptions.h | 56 +- .../GemmGatedActOptions.h | 2 +- .../trtllmGen_bmm_export/GemmOptions.h | 4 +- .../trtllmGen_bmm_export/KernelMetaInfo.h | 16634 +++++++++++++++- .../trtllmGen_bmm_export/KernelParams.h | 114 +- .../trtllmGen_bmm_export/KernelParamsDecl.h | 51 +- .../trtllmGen_bmm_export/KernelTraits.h | 74 +- .../trtllmGen_bmm_export/TmaDescriptor.h | 6 +- .../trtllmGen_bmm_export/config.json | 271 +- ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._s4_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._s4_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._s3_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._s3_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedP_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._schedS_biasM_bN_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._s4_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._s4_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ..._s3_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._s3_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ..._s4_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._s4_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ..._s3_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._s3_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...N_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp} | 4 +- ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + .../blockScaleMoe/DevKernel.cu | 15 +- .../blockScaleMoe/DevKernel.h | 5 + .../blockScaleMoe/RoutingRenormalize.cu | 165 +- .../trtllmGenKernels/blockScaleMoe/runner.cu | 121 +- .../trtllmGenKernels/blockScaleMoe/runner.h | 40 +- .../trtllmGenKernels/fmha/CMakeLists.txt | 2 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp} | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...Kv128Persistent2CtaKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...2VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...aVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...vVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ16Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...qQ64Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...SeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp | 4 +- ...4VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 + ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 + ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...Q128Kv128PersistentKeepsAbForGen_cubin.cpp | 4 +- ...rSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 4 +- ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 4 +- ...qQ16Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...arSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...eqQ8Kv128PersistentSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp | 4 +- ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...lP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...lP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...eP32VarSeqQ128Kv128StaticContext_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...eP64VarSeqQ128Kv128StaticContext_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...usalVarSeqQ128Kv128StaticContext_cubin.cpp | 3 - ...VarSeqQ128Kv128PersistentContext_cubin.cpp | 3 - ...enseVarSeqQ128Kv128StaticContext_cubin.cpp | 3 - .../fmha/cubin/kernelMetaInfo.h | 3035 +-- .../trtllmGenKernels/fmha/fmhaKernels.h | 39 +- .../trtllmGenKernels/fmha/fmhaRunner.cpp | 5 +- .../trtllmGenKernels/fmha/fmhaRunnerParams.h | 2 + .../trtllmGenKernels/fmha/kernelParams.h | 8 +- .../trtllmGenKernels/gemm/KernelRunner.cpp | 4 +- .../gemm/trtllmGen_gemm_export/Enums.h | 56 - .../trtllmGen_gemm_export/GemmInterface.h | 70 +- .../gemm/trtllmGen_gemm_export/GemmOptions.h | 340 +- .../trtllmGen_gemm_export/KernelMetaInfo.h | 833 +- .../gemm/trtllmGen_gemm_export/KernelParams.h | 764 +- .../trtllmGen_gemm_export/KernelParamsDecl.h | 324 - .../gemm/trtllmGen_gemm_export/KernelTraits.h | 174 +- .../trtllmGen_gemm_export/TmaDescriptor.h | 133 +- .../gemm/trtllmGen_gemm_export/config.json | 9 + ...luster1x1x1_16dp256bit_TN_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ...luster1x1x1_16dp256bit_TN_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ...bit_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...tK2_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...bit_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ...luster1x1x1_16dp256bit_TN_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ...bit_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...tK2_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...luster1x1x1_16dp256bit_TN_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ...luster1x1x1_16dp256bit_TN_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 + ...bit_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...tK2_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...bit_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...luster1x1x1_16dp256bit_TN_sm100a_cubin.cpp | 3 + ...bit_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...bit_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...a1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...a1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...dp256b_TN_transOut_schedS_sm100a_cubin.cpp | 3 - ...plitK2_TN_transOut_schedS_sm100a_cubin.cpp | 3 - ...dp256b_TN_transOut_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...dp256b_TN_transOut_schedS_sm100a_cubin.cpp | 3 - ...plitK2_TN_transOut_schedS_sm100a_cubin.cpp | 3 - ...a1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...a1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 - ...dp256b_TN_transOut_schedS_sm100a_cubin.cpp | 3 - ...plitK2_TN_transOut_schedS_sm100a_cubin.cpp | 3 - ...dp256b_TN_transOut_schedS_sm100a_cubin.cpp | 3 - ...a1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp | 3 - ...dp256b_TN_transOut_schedS_sm100a_cubin.cpp | 3 - ...dp256b_TN_transOut_schedS_sm100a_cubin.cpp | 3 - .../gemmGatedAct/KernelRunner.cpp | 22 +- .../gemmGatedAct/KernelRunner.h | 4 +- .../trtllmGen_gatedAct_export/Enums.h | 61 - .../GemmGatedActInterface.h | 110 +- .../GemmGatedActOptions.h | 50 +- .../trtllmGen_gatedAct_export/GemmOptions.h | 604 +- .../KernelMetaInfo.h | 414 +- .../trtllmGen_gatedAct_export/KernelParams.h | 308 +- .../trtllmGen_gatedAct_export/KernelTraits.h | 207 +- .../trtllmGen_gatedAct_export/TmaDescriptor.h | 153 +- ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 - ...r1x1x1_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...plitK4_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...16dp256b_TN_schedS_swiGlu_sm100a_cubin.cpp | 3 - ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 - ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 - ...ma128x128x32_cluster1x1x1_sm100a_cubin.cpp | 3 + ...r1x1x1_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...plitK4_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 - ...r1x1x1_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...plitK4_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...16dp256b_TN_schedS_swiGlu_sm100a_cubin.cpp | 3 - ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 - ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 - ...ma128x128x32_cluster1x1x1_sm100a_cubin.cpp | 3 + ...r1x1x1_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...plitK4_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 - ...r1x1x1_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...plitK4_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...16dp256b_TN_schedS_swiGlu_sm100a_cubin.cpp | 3 - ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 - ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 - ...ma128x128x32_cluster1x1x1_sm100a_cubin.cpp | 3 + ...r1x1x1_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...plitK4_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 - ...r1x1x1_transposeMmaOutput_sm100a_cubin.cpp | 3 + ...plitK4_transposeMmaOutput_sm100a_cubin.cpp | 3 + .../trtllm/gen/CommonUtils.h | 5 - .../trtllm/gen/CudaKernelLauncher.h | 5 - .../trtllm/gen/DtypeDecl.h | 93 +- .../trtllm/gen/MmaDecl.h | 90 - .../trtllm/gen/SfLayoutDecl.h | 5 - 1948 files changed, 23857 insertions(+), 8585 deletions(-) create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp rename cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp => Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp} (81%) create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp rename cpp/tensorrt_llm/kernels/trtllmGenKernels/{gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp => fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp} (81%) create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParamsDecl.h create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/MmaDecl.h diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp index b50c2e2104..b1bc092753 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp @@ -65,6 +65,8 @@ std::vector prioritizePredefinedConfigs(int m, int n, int k, std::vecto // // Dummy // + + // Qwen3_235B_TP8_EP1_MoE_FC2 m=4096 k=192 if (n /* out_dim */ == 0 && k /* in_dim */ == 0) { auto pred = [](BatchedGemmConfig const& config) @@ -100,13 +102,27 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(TrtllmGenBatchedGemmRunne auto const options = configs[i].mOptions; auto const tileSize = mOptions.transposeMmaOutput ? options.mTileN : options.mTileM; // When we include low-latency kernels we can set transposeMmaOutput via constructor - if (options.mDtypeA == mOptions.eltType && options.mDtypeC == mOptions.outputType - && options.mUseDeepSeekFp8 == mOptions.deepSeekFp8 + if (options.mDtypeA == mOptions.dtypeA && options.mDtypeB == mOptions.dtypeB + && options.mDtypeC == mOptions.dtypeC && options.mUseDeepSeekFp8 == mOptions.deepSeekFp8 && options.mTransposeMmaOutput == mOptions.transposeMmaOutput && (!doesRouteImplUseNoRoute(options.mRouteImpl)) == mOptions.routeAct && options.mFusedAct == mOptions.fusedAct && options.mIsStaticBatch == mOptions.staticBatch && tileSize == mOptions.tileSize) { + // FIXME: Disable split-k for now. + if (options.mClusterDimZ != 1) + { + continue; + } + + if (options.mFusedAct) + { + if (options.mActType != static_cast(mOptions.actType)) + { + continue; + } + } + if (mOptions.transposeMmaOutput && options.mEpilogueTileM == mOptions.epilogueTileM) { mPassingConfigIndices.push_back(i); @@ -146,9 +162,10 @@ size_t TrtllmGenBatchedGemmRunner::getWorkspaceSizeInBytes(int32_t m, int32_t n, void TrtllmGenBatchedGemmRunner::run(int32_t m, int32_t n, int32_t k, std::vector const& batchedTokens, int32_t numTokens, int32_t numBatches, int32_t maxNumCtasInBatchDim, void const* a, void const* sfA, void const* b, void const* sfB, void const* perTokensSfA, void const* perTokensSfB, float const* scaleC, float const* scaleGateC, - void* c, void* outSfC, int32_t const* routeMap, int32_t const* totalNumPaddedTokens, - int32_t const* ctaIdxXyToBatchIdx, int32_t const* ctaIdxXyToMnLimit, int32_t const* numNonExitingCtas, - void* workspace, CUstream stream, int device, int32_t configIndex) + float const* ptrBias, float const* ptrAlpha, float const* ptrBeta, float const* ptrClampLimit, void* c, + void* outSfC, int32_t const* routeMap, int32_t const* totalNumPaddedTokens, int32_t const* ctaIdxXyToBatchIdx, + int32_t const* ctaIdxXyToMnLimit, int32_t const* numNonExitingCtas, void* workspace, CUstream stream, int device, + int32_t configIndex) { auto bmm = BatchedGemmInterface(); @@ -200,6 +217,10 @@ void TrtllmGenBatchedGemmRunner::run(int32_t m, int32_t n, int32_t k, std::vecto gemmData.mInputBuffers.mPtrScaleGate = scaleGateC; gemmData.mInputBuffers.mPtrPerTokenSfA = mOptions.transposeMmaOutput ? perTokensSfB : perTokensSfA; gemmData.mInputBuffers.mPtrPerTokenSfB = mOptions.transposeMmaOutput ? perTokensSfA : perTokensSfB; + gemmData.mInputBuffers.mPtrBias = ptrBias; + gemmData.mInputBuffers.mPtrSwiGluAlpha = ptrAlpha; + gemmData.mInputBuffers.mPtrSwiGluBeta = ptrBeta; + gemmData.mInputBuffers.mPtrClampLimit = ptrClampLimit; gemmData.mInputBuffers.mPtrRouteMap = routeMap; @@ -242,7 +263,22 @@ void TrtllmGenBatchedGemmRunner::run(int32_t m, int32_t n, int32_t k, std::vecto // Dispatch with block scaling factors and with static batching. run(m, n, k, batchedTokens, /* numTokens */ 0, batchedTokens.size(), /* maxNumCtasInBatchDim */ 0, a, sfA, b, sfB, /* perTokensSfA */ nullptr, /* perTokensSfB */ nullptr, - /* scaleC */ nullptr, /* scaleGateC */ nullptr, c, outSfC, + /* scaleC */ nullptr, /* scaleGateC */ nullptr, /* ptrBias */ nullptr, /* ptrAlpha */ nullptr, + /* ptrBeta */ nullptr, /* ptrClampLimit */ nullptr, c, outSfC, + /* routeMap */ nullptr, /* totalNumPaddedTokens */ nullptr, + /* ctaIdxXyToBatchIdx */ nullptr, /* ctaIdxXyToMnLimit */ nullptr, + /* numNonExitingCtas */ nullptr, workspace, stream, device, configIndex); +} + +void TrtllmGenBatchedGemmRunner::run(int32_t m, int32_t n, int32_t k, std::vector const& batchedTokens, + void const* a, void const* sfA, void const* b, void const* sfB, float const* ptrBias, float const* ptrAlpha, + float const* ptrBeta, float const* ptrClampLimit, void* c, void* outSfC, void* workspace, CUstream stream, + int device, int32_t configIndex) +{ + // Dispatch with block scaling factors and with static batching. + run(m, n, k, batchedTokens, /* numTokens */ 0, batchedTokens.size(), /* maxNumCtasInBatchDim */ 0, a, sfA, b, sfB, + /* perTokensSfA */ nullptr, /* perTokensSfB */ nullptr, + /* scaleC */ nullptr, /* scaleGateC */ nullptr, ptrBias, ptrAlpha, ptrBeta, ptrClampLimit, c, outSfC, /* routeMap */ nullptr, /* totalNumPaddedTokens */ nullptr, /* ctaIdxXyToBatchIdx */ nullptr, /* ctaIdxXyToMnLimit */ nullptr, /* numNonExitingCtas */ nullptr, workspace, stream, device, configIndex); @@ -255,7 +291,9 @@ void TrtllmGenBatchedGemmRunner::run(int32_t m, int32_t n, int32_t k, std::vecto // Dispatch with block scaling factors and with static batching. run(m, n, k, batchedTokens, /* numTokens */ 0, batchedTokens.size(), /* maxNumCtasInBatchDim */ 0, a, /* sfA */ nullptr, b, /* sfB */ nullptr, /* perTokensSfA */ nullptr, /* perTokensSfB */ nullptr, scaleC, - scaleGateC, c, /* outSfC */ nullptr, + scaleGateC, /* ptrBias */ nullptr, /* ptrAlpha */ nullptr, /* ptrBeta */ nullptr, /* ptrClampLimit */ nullptr, + c, + /* outSfC */ nullptr, /* routeMap */ nullptr, /* totalNumPaddedTokens */ nullptr, /* ctaIdxXyToBatchIdx */ nullptr, /* ctaIdxXyToMnLimit */ nullptr, /* numNonExitingCtas */ nullptr, workspace, stream, device, configIndex); @@ -281,7 +319,6 @@ std::vector TrtllmGenBatchedGemmRunner::getValidConfigIndices(int32_t m gemmData.mProblemDimensions.mRank = 0; gemmData.mProblemDimensions.mWorldSize = 1; gemmData.mProblemDimensions.mMaxNumCtasInTokenDim = maxNumCtasInBatchDim; - // Tier 0: K < tileK, prefer higher efficiency. auto cmpTier0 = [&configs, &gemmData](int64_t idx0, int64_t idx1) { @@ -343,7 +380,6 @@ std::vector TrtllmGenBatchedGemmRunner::getValidConfigIndices(int32_t m } return false; }; - // Sort configs by options. std::vector sortedIndices = mPassingConfigIndices; std::sort(sortedIndices.begin(), sortedIndices.end(), cmpTier0); diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h index 6c87de22fd..7d8ca3eb21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h @@ -27,10 +27,28 @@ namespace tensorrt_llm namespace kernels { +// Keep this in sync with the ActType in +// cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h +enum class ActType +{ + // For ActType == SwiGlu, ideally we would like to have something like + // gatedAct = scaleC * (x0 * scaleAb + beta) * ((x1 * scaleGate) * sigmoid(alpha * x1 * + // scaleGate)). + // But for now, we use the simplified version + // gatedAct = scaleC' * (x0 + beta') * ((x1 * scaleGate) * sigmoid(alpha * x1 * scaleGate)), + // where x0 and x1 are the raw numbers from Gemm, while scaleC and scaleGate are input scales, + // beta' = beta / scaleAb, scaleC' = scaleC * scaleAb. + // + // GatedSilu is a special case of SwiGlu where the alpha is 1.0 and the beta is 0.0. + SwiGlu +}; + struct TrtllmGenBatchedGemmRunnerOptions { - batchedGemm::trtllm::gen::Dtype eltType; - batchedGemm::trtllm::gen::Dtype outputType; + batchedGemm::trtllm::gen::Dtype dtypeA; + batchedGemm::trtllm::gen::Dtype dtypeB; + batchedGemm::trtllm::gen::Dtype dtypeC; + ActType actType{ActType::SwiGlu}; bool deepSeekFp8{false}; bool fusedAct{false}; bool routeAct{false}; @@ -53,7 +71,8 @@ public: void run(int32_t m, int32_t n, int32_t k, std::vector const& batchedTokens, int32_t numTokens, int32_t numBatches, int32_t maxNumCtasInBatchDim, void const* a, void const* sfA, void const* b, void const* sfB, void const* perTokensSfA, void const* perTokensSfB, float const* scaleC, - float const* scaleGateC, void* c, void* outSfC, int32_t const* routeMap, int32_t const* totalNumPaddedTokens, + float const* scaleGateC, float const* bias, float const* swiGluAlpha, float const* swiGluBeta, + float const* clampLimit, void* c, void* outSfC, int32_t const* routeMap, int32_t const* totalNumPaddedTokens, int32_t const* ctaIdxXyToBatchIdx, int32_t const* ctaIdxXyToMnLimit, int32_t const* numNonExitingCtas, void* workspace, CUstream stream, int device, int32_t configIndex); @@ -62,6 +81,11 @@ public: void const* b, void const* sfB, void* c, void* outSfC, void* workspace, CUstream stream, int device, int32_t configIndex); + void run(int32_t m, int32_t n, int32_t k, std::vector const& batchedTokens, void const* a, void const* sfA, + void const* b, void const* sfB, float const* bias, float const* swiGluAlpha, float const* swiGluBeta, + float const* clampLimit, void* c, void* outSfC, void* workspace, CUstream stream, int device, + int32_t configIndex); + // FP8 per-tensor scaling GEMM void run(int32_t m, int32_t n, int32_t k, std::vector const& batchedTokens, void const* a, void const* b, float const* scaleC, float const* scaleGateC, void* c, void* workspace, CUstream stream, int device, diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h index 49d23d13dd..2da28940b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h @@ -247,35 +247,10 @@ struct BatchedGemmData // The clamp limit for the accumulator before applying the activation. // Shape is [B]. // Clamp is INF if nullptr. - // When the input is FP8 or NVFP4, the clamp has to be scaled by limit' = limit / dequantAb. // If applied on SwiGlu, it will be: // // x_glu = x_glu.clamp(min=None, max=limit) // x_linear = x_linear.clamp(min=-limit, max=limit) - // - // The given clamp limit applies to the dequantized values, so the order of operations would - // look something like this: - // - // x0 = x0 * dqAb - // x0 = clamp(x0, none, limit) - // x0 = x0 * sigmoid(alpha * x0) - // x1 = dqAb * x1 - // x1 = clamp(x1, -limit, limit) - // out = qC * (x1 + beta) * x0 - // - // Given that the dqAb and qC are combined into scaleC, we can bring the dqAb into the clamp - // limit and apply the clamping prior to dequantization: - // - // x0 = clamp(x0, none, limit / dqAb) - // x0 = x0 * dqAb - // x0 = x0 * sigmoid(alpha * x0) - // x1 = clamp(x1, -limit / dqAb, limit / dqAb) - // scaleC = dqAb * qC - // beta' = beta / dqAb - // out = scaleC * (x1 + beta') * x0 - // - // Note this assumes that scaleAb == scaleGate which is true in TRT-LLM MoE use-case - // float const* mPtrClampLimit{nullptr}; // The alpha and beta for SwiGlu. diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h index 01b85f4bcc..8388779f75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h @@ -99,7 +99,7 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions int32_t sfReshapeFactor, gemm::TileScheduler tileScheduler, gemmGatedAct::ActType actType, bool clampBeforeAct, std::vector batchedM, std::vector batchedN, BatchMode batchMode, int numBatches, bool isStaticBatch, int numTokens, RouteImpl routeImpl, bool gridWaitForPrimaryRouting, bool fusedAct, - int numRegsPerThreadNonEpilogueWarp, int numRegsPerThreadEpilogueWarp, int numRegsCastAWarps, bool useTmaOobOpt) + int numRegsPerThreadNonEpilogueWarp, int numRegsPerThreadEpilogueWarp, int numRegsCastAWarps) : gemmGatedAct::GemmGatedActOptions( gemm::GemmOptions(allReduceAlgo, biasType, blockK, clusterDimX, clusterDimY, clusterDimZ, dtypeAcc, dtypeA, dtypeB, dtypeC, dtypeMmaA, dtypeMmaB, enablesEarlyExit, enablesDelayedEarlyExit, enablesGlobalPtxKnobs, @@ -116,16 +116,15 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions , mBatchedM(batchedM) , mBatchedN(batchedN) , mBatchMode(BatchMode(batchMode)) - , mFusedAct(fusedAct) - , mGridWaitForPrimaryRouting(gridWaitForPrimaryRouting) - , mIsStaticBatch(isStaticBatch) , mNumBatches(numBatches) + , mIsStaticBatch(isStaticBatch) + , mNumTokens(numTokens) + , mRouteImpl(routeImpl) + , mGridWaitForPrimaryRouting(gridWaitForPrimaryRouting) + , mFusedAct(fusedAct) , mNumRegsPerThreadNonEpilogueWarp(numRegsPerThreadNonEpilogueWarp) , mNumRegsPerThreadEpilogueWarp(numRegsPerThreadEpilogueWarp) , mNumRegsCastAWarps(numRegsCastAWarps) - , mNumTokens(numTokens) - , mRouteImpl(routeImpl) - , mUseTmaOobOpt(useTmaOobOpt) { } @@ -135,28 +134,28 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions std::vector mBatchedN; // Whether batching M or N. BatchMode mBatchMode{BatchMode::BatchM}; - // Whether to perform a fused gated activation. - bool mFusedAct{false}; + // Number of Gemm batches. + int mNumBatches; + + // Whether the batch size is static (i.e. known at kernel launch time). + bool mIsStaticBatch{true}; + // Total number of tokens. + int mNumTokens{32}; + // Whether load the input tokens and do routing. + RouteImpl mRouteImpl{RouteImpl::NoRoute}; // Whether the loads that load from ptrRouteMap, ptrTotalNumPaddedTokens, // ptrCtaIdxXyToBatchIdx, etc.. should wait on a grid dependency. bool mGridWaitForPrimaryRouting{true}; - // Whether the batch size is static (i.e. known at kernel launch time). - bool mIsStaticBatch{true}; - // Number of Gemm batches. - int mNumBatches; + + // Whether to perform a fused gated activation. + bool mFusedAct{false}; + // Number of registers per thread for non-epilogue warps int mNumRegsPerThreadNonEpilogueWarp{0}; // Number of registers per thread for epilogue warps int mNumRegsPerThreadEpilogueWarp{0}; // Number of registers for the cast A warps. int mNumRegsCastAWarps{0}; - // Total number of tokens. - int mNumTokens{32}; - // Whether load the input tokens and do routing. - RouteImpl mRouteImpl{RouteImpl::NoRoute}; - // Whether to use TMA out-of-bounds optimization to reduce wasted traffic. See details in - // BatchedGemm/KernelParamsDecl.h. - bool mUseTmaOobOpt{false}; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -166,20 +165,6 @@ bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool isBlackw { bool isValid = true; - if (options.mUseTmaOobOpt && !options.mUseTwoTmaLoadWarps) - { - if (updateOptions) - { - // Since any routing (mRouteAct != NoRoute) requires mUseTwoTmaLoadWarps == true. - // Single TMA load warp is not the target use case for OOB optimization. - options.mUseTmaOobOpt = false; - } - else - { - TLLM_CHECK_ERROR(false, "TMA OOB optimization requires two TMA load warps."); - return false; - } - } if (options.mFusedAct) { // ensure that we check the fused options as well @@ -382,8 +367,7 @@ inline std::string dumpOptions(BatchedGemmOptions const& options) ss << "mFusedAct=" << options.mFusedAct << "," << std::endl; ss << "mNumRegsPerThreadNonEpilogueWarp=" << options.mNumRegsPerThreadNonEpilogueWarp << "," << std::endl; ss << "mNumRegsPerThreadEpilogueWarp=" << options.mNumRegsPerThreadEpilogueWarp << "," << std::endl; - ss << "mNumRegsCastAWarps=" << options.mNumRegsCastAWarps << "," << std::endl; - ss << "mUseTmaOobOpt=" << options.mUseTmaOobOpt << std::endl; + ss << "mNumRegsCastAWarps=" << options.mNumRegsCastAWarps << std::endl; return ss.str(); } diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h index deedee27ca..7970b7920f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h @@ -179,7 +179,7 @@ inline std::string dumpOptions(GemmGatedActOptions const& options) ss << gemm::dumpOptions(options) << ", "; ss << "mActType=" << "gemmGatedAct::ActType(" << static_cast(options.mActType) << ")," << std::endl; - ss << "mClampBeforeAct=" << options.mClampBeforeAct << "" << std::endl; + ss << "mClampLimit=" << options.mClampBeforeAct << "," << std::endl; return ss.str(); } diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h index 7d25c117a5..7a1155e567 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h @@ -527,7 +527,6 @@ inline int32_t getShuffleBlockSize(int epilogueTileM) inline bool checkAndUpdateGemmOptions( GemmOptions& options, bool isBlackwell, int /* tpGrpSize */, bool updateOptions = true) { - if (options.mDtypeB == tg::Dtype::Void) { if (updateOptions) @@ -568,8 +567,7 @@ inline bool checkAndUpdateGemmOptions( // Currently, we only support {MxFp4, NvFp4} -> Bf16. TLLM_CHECK_ERROR((options.mDtypeA == options.mDtypeMmaA) || ((options.mDtypeA == tg::Dtype::MxE2m1 || options.mDtypeA == tg::Dtype::E2m1) - && options.mDtypeMmaA == tg::Dtype::Bfloat16) - || (options.mDtypeA == tg::Dtype::E2m1 && options.mDtypeMmaA == tg::Dtype::E4m3), + && options.mDtypeMmaA == tg::Dtype::Bfloat16), "Unsupported cast for A: ", tg::dtypeToString(options.mDtypeA), " -> ", tg::dtypeToString(options.mDtypeMmaA)); // Check that the B cast is supported. diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h index 32b52710cb..284e03c794 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h @@ -28,229 +28,597 @@ namespace kernels { // clang-format off -#define TLLM_GEN_COMMIT "32110ebf-dirty" +#define TLLM_GEN_COMMIT "018762ca-dirty" #define TLLM_GEN_EXPORT_VERSION "7.0.3.0.3.0" -static constexpr size_t tllmGenBatchedGemmListLen = 104; +static constexpr size_t tllmGenBatchedGemmListLen = 288; #ifndef EXCLUDE_SM_100 -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; #endif // EXCLUDE_SM_100 #ifndef EXCLUDE_SM_100 -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; #endif // EXCLUDE_SM_100 static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { #ifndef EXCLUDE_SM_100 -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "6a0bf2c102efef21017cfd8c1ea75e72cbadee5e1cb82c2abfbb2370cf28948d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "dcd13f84a37c44fd9ba2672e0776f4cbcacc998022748a931c209ee227a38097", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -318,7 +686,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -331,9 +699,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "a3126bb4254dcab10372faae7e93a2c743ce97735fc19ba35bda963f524ea8f8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "1e68104f4cc4a1f5a2b12d4ea0c910e9ae795f672b53c23f7a6d5e14a4af0600", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -401,7 +768,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -414,9 +781,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "d6ecf38671485ed929e33358d61c931ed1851601dba1a899d4e0f7484e8aa6d6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "113498b9badea7f3a222bea985f32c4caddccaa43ae146f4b0e15bced239bd5f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -484,7 +850,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -497,9 +863,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "848bd68403ebc01cd94dcb0988f5890f914bfed6bafa4a1470b34902516eca49", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "be4715e44609995676ef38df904a28f0979e1d4e5e46ba1af95dfa8b65df3daf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -567,7 +932,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -580,9 +945,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "25a8cb0f1b214b59a76feae166eb73dcff644be5065987846dba5eb393fcba19", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "e5576e97179bfed8e8e7bed2270c0c78c6b09d83d0fc64d5996b89d37f3565ea", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -650,7 +1014,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -663,9 +1027,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "0f582ab94bcee39077607621b7fed01ce79838f77f3abf17ce65439bf57ab9d6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "2d42200e86b01b530fcddc9ea1db1d0692ff90352c46a112605544749fa54f38", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -733,7 +1096,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -746,9 +1109,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "94b39523665c17ec783c22b4fbf47523132146e365b4c450db6ba4b6f52afb56", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "40f54da044cd96cf53159887a0403146b311961223fd7c5fc5951eb5ee6755ab", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -816,7 +1178,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -829,9 +1191,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "d18a2f733180a782f35e022fbbe76417e8c143a117e6d6b80045fe1faf48d38c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "71873a5224fafe04d2b873c4a662d180b522e64bcc97ced54be68c381a865ced", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -899,7 +1260,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -912,9 +1273,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "0eb05008fb00aac51d2e132d1dad9587b602aced72a2a60a036b4ddb30acb781", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "d33facd7f87429fca3369d0c58b3f942e79fc930be3882ab12520aa7d7da60b7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -982,7 +1342,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -995,9 +1355,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "378127bf506de81c353b6c9ad1567f567fbc8f33b5fb799217c52e44ffc9a0e9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "b8b5e5ccbb44ee2044473acb61e7c3fad95b36d1972505017d8d35ff5e593d00", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1065,7 +1424,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1078,9 +1437,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "5bc57208f3139adf191b7a21c5fd987e85268aba7f960af0759a413b3a477510", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "389aefd66505a590d1f5dc792693e91faa40df35d8c37e816d082661a9f44256", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1148,7 +1506,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1161,9 +1519,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "3aa161b02771849cb272013bfa7964e9bf5aa10712e653fbd12213baf9a4bd4b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "28022d9891f75b55fec5245c93e9b076631c0004661ecda452e8901822d160ac", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1231,7 +1588,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1244,9 +1601,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "dc1c044fbb88c65f06c90b998bff9dfca616a5e45c5f9189d01063d9300ca1a6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "410025847d05c7264d40413d15af0d3c9d71ba42fda96d392d3509b58a2f4f21", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1314,7 +1670,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1327,9 +1683,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "fa6f4a009ed7eb28b0f9b3a12cd2174b5fb53bf869f698b3a708548ec34157a4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "009532ad75e4498d137d6f9ac513f77b96289796e94eb16f1bd295ed2a354b43", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1397,7 +1752,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1410,9 +1765,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "783ab74ed00ce17fcad0aa518d27fc54ffa3a5a5d35a1fae41560e55c56c4534", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "d910ab811882f9c380923ce2c45f08b2aacfaaf4183735fda114883e481184be", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1480,7 +1834,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1493,9 +1847,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "89b83df6a7fdb94d161b15c1acaa99e5d009eb1db723598d31512fc293d14825", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "f1a46a74d334aec0eb8a258e293023ce0f92527de23a1336b6cfba78912d3b87", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1563,7 +1916,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1576,9 +1929,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "54a368788db560b6ab92ba6de4503a9d715a660396f85f29859133236feb0c66", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "0937e122abc610bd41b90a7474c6dc30be0b07e010a451576bf128dbccec6b11", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1646,7 +1998,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1659,9 +2011,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "685ff9164cbb048dafde74cdb3a9e83fdafc3585f9d1fde713bc40fbbcc5ba1d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "d54c0f92a03f62002fff4c5f9e3915a183137a66421be8571f19b2ed3f16c4bc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1729,7 +2080,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1742,9 +2093,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "c21e7ffe165ed982b6515aac1e99513237efcc7c2b612d7a8ba04583d4503938", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "2e09c69e28b99da68ab8551673306a4eef46d805b26eda1490d454ad30e33d95", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1812,7 +2162,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1825,9 +2175,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "e94003ae6b02fbb2872455161ed537c65c13bc49ed1bb4173dc950ca36fd4d9c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "7f087ba5797c3995c9da864031b21368dedbfdbeb44703fd40f1a83f2d2d063d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1895,7 +2244,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1908,9 +2257,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "e51bf3386fd7135ef7a21eb495fb8a82eec138b68729b5f60e992c47739d7e6c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "a1a6bb42f366836d4a9e6daaf96b8af89f911b3106dcc0b7222513f30e9ba3ec", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1978,7 +2326,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1991,9 +2339,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "9f9498b2cba3f9cffc14f6f3c2b1bf5d49d175da485fdd28bdc7e33d84bc6fa5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "5eac77766bcc959bdb459108c244e70464a1ae048a01214483f75c02e8de297c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2061,7 +2408,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2074,9 +2421,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "0e1f1f36b5de227bf129c1b4ab9f9346b8e586f894dfe1ef8d110150c0ea4d03", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "fdccf72f527a1c70236993d8f2f951ecaf971808014fa2afad8479bcc7bc28fd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2144,7 +2490,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2157,9 +2503,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "d74ce52dbeb5c157047bf4c333ceca8bf2faaea6d453c6870e7b4e9a0bffbfc5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "83cde2971a937a02642b018c2dc62a3d098b032d2961c4c15ecf23d94c6f4759", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2227,7 +2572,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2240,9 +2585,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "480e7f648ebd09b8238ab40897e1b4c5a5034d6d82df66e64912d999fbcbd6a0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "004915fae60a35731038e2af4aebf70f7fa0facf0e2c923df209086c3e5ae115", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2310,7 +2654,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2323,9 +2667,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "cf2750f986199840ad6f9505db4791b98321bc81f0b097630eee5088d142f986", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "2ebbbcba52b8b00c94571b04afc99c47f27a141529707096a82ee146dd0f7485", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2393,7 +2736,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2406,9 +2749,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "65981a268c85412e963e9eff70363151eb568d3bf5ab4177909ed36f98afa006", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "800e3f410f62345b1dcbd66f321ba3d9e16d323c45930a1fb4b9b31eaebfde87", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2476,7 +2818,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2489,9 +2831,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "32e40d5c9a6bac77cde65c0a7dc8c16621aa84d83a9cfaf7899aa5545e068530", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "614a29b92801de07c8b36f071b051fc71025b10db57b1b72b31c71f578da74b6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2559,7 +2900,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2572,9 +2913,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "9ddd33b56695cff3d8dd10c0fcaf9499be97858bcdca9bf4a4e5d6ba0d84c0eb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "10b05da0f3ae183ca9749281c961fd61ac7e5f33ccaadc9584db5f705aeab879", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2642,7 +2982,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2655,9 +2995,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "7d1f7c8a2e70a49a877399c5c3d2fc7df6f9cbaf2c0a0019c8f476485d618b6e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "35f66e3d559e2ed5e28c60fafbdb670b97cb4e5a1a1aaec46a46246db644b3ef", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2725,7 +3064,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2738,9 +3077,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "9df00bde72fe7c36c09c2989183fccaaa90f7a4e79d6d0cce36c78049a270a85", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a", 448, "c579bf5faa33ac5febee73279e2d49e4ad1001a3d360999bcf8b323b820c5649", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2808,7 +3146,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2821,9 +3159,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "b61ee79c9c7595deee79157bd2d10de9d500b9c36d68a569b585d80ed854c35a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "d173dd09116ccf1f0afcda70c3d79ac29e20120c31eea2a1dcea587c48efe25d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2891,7 +3228,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2904,9 +3241,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "1ff7246442e16863ccf377d88a7bee44e8dd96aeb16e6bdb4a6dd5f84b40ff84", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "99b92263c3e83a77c500fd2a2200cfdf68380d652092a586ef4863fa0de56f18", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2974,7 +3310,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2987,9 +3323,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "759fceb6fdf0026cf171b5c5aa8468a18930170c8d52813b0c3d5c4ef825b6c6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a", 448, "379be03f3d7bb8a714c2c288c7c3ec5916d3d59e46958c331ce43a4cd5c01ccb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3057,7 +3392,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3070,9 +3405,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "c10774753e0c8c08bf8c90aaf8a8f81d6e2dc0fda372beb895b5c3af7fc4ef31", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "db1d6a3775ac8b4cd988507b5a1ca47349ae56faee8b9a9809f6498cbd805c6f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3140,7 +3474,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3153,9 +3487,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "80ae2cb645bb1ea1a885f7223dc75f1a12a14ec1e6519e8355c9d4af179c1e30", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "1d9f64edf31656d8742eec78363b91fb9b3911971ccf51d32969a7a245509274", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3223,7 +3556,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3236,9 +3569,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "a8ebd0574dc30c1e06d4da89716e4c040109d5481718e9bd33bd2d919f933e53", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "d570d48f01aeba2ca9976fc64e5673347cd4aead9db8cc5bf4f284519e51ea9b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3306,7 +3638,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3319,9 +3651,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "e4c4e89c846e3b3250dc10e30ac956d04d14957d738118b019e934890b00c108", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "cc5ab7c76ce7cbf7a2695742322c2123e30998c626dedd1a204785d0556e18ac", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3389,7 +3720,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3402,9 +3733,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "881acb91bd5ff416df31d7e9b95206c9a7349f3e0f06d3313fa1c9e642fa88bf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "4bd1321a78b142af51405c66058f8acad5d3ff1e3cada9e4dedd59617117c11d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3472,7 +3802,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3485,9 +3815,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "ab9799b4898c64e9c1f13d351dd32a59e290486f4e3321863d691f4029b61c42", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "903da0851e9494530288a986c6c8d94a4a04525bf927cd359db68fefce9f4615", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3555,7 +3884,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3568,9 +3897,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "ffba44ffc7b841b0b9073057e8e210c2077091bc7f6139bb553d4a701daec667", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "28351355d8e51420290ab29001b4d4d3ad8befabe81d997beb44b094dc7d1670", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3638,7 +3966,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3651,9 +3979,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "b1711c24e234aaf742a93d5bfef0dc2c311530929f95803918c54d45e12a99c2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "6ffb84a95c2803867a0492ee024e16fc32d0986514b04fc43c36bfb62d3e92db", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3721,7 +4048,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3734,9 +4061,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "0c81fd570d07a6321297366a9c9f63b48f918a4a4c7b5fdf6691a994da3d80e4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "318ee60c00683bfc467e7c81cab73ee79743cc4d7029bef6e6f1f3a651efc7c7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3804,7 +4130,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3817,9 +4143,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "d7223efec141e24a2ed159dd0df9af8f753528628349fcd351d522a33e8a8276", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "d508ca1ee96c46570f2ea499fe06125d7d731eda1c07fff1523ff45b6b24b024", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3887,7 +4212,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3900,9 +4225,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "8a965ae0938f555b7b0e73df5359c2b94c105d9c851f62946bafbf144f3c9ed9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "abc85e11c4481af85cbbcf640da574a8054267c92c8fcba8aa03f85736ae1498", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3970,7 +4294,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3983,9 +4307,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "83578cefe6e21fc33064333510c1286de8420a883cca7a93c608c4e4951677aa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "cd6d0249dfa2c85e631efe1068d09e38db0e26cdf4914edcff5448bcdfcf190e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4053,7 +4376,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4066,9 +4389,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "f479ff64bf61dfc0bcba3d4d45903dc296010c55feade97734082aff2de5d0bd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "28ebc984a3ac52fc2bee2959af1788fe9037fd70b762ff675856eb4efb2be9da", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4136,7 +4458,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4149,9 +4471,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "c8f81b23270d852d3e05b6450d0ed40b0d909a9f379031448a6b1bee209394bc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "eef8766fd9007de7ac4d11c2e6f641c7e965bbf48b7298c9c4a4945168549826", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4219,7 +4540,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4232,9 +4553,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "2e7362e02b49aa744e37838af305803da9e132e73affcad605bd9495bf3902bc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "52229774c9dfeeadbf9cc21df823fe932d75d08a1f7efcd7dbef384911472038", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4302,7 +4622,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4315,9 +4635,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "051562e78b1477ddef4292dbc040048ca3c85ca6dc269b3346a8cebb7775c254", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "7339a9b452379a2d29ffbc8c814c51e0ab7f653a9b28cdc12c0f84b84c86e07c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4385,7 +4704,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4398,9 +4717,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "976391756f5832221aba8f0097f93a0cd02e2cde58ce5a28bfad5fef6ac645c4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "45c3270f9b05e3830942473e363dacc7f68a92f44b3c4215d0ed82d38787823b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4468,7 +4786,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4481,9 +4799,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "66c8412d8c3ebf58b73ec4554f40f5830904f5065d70692127004c0ab04e7330", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "d6ffa84613e870260260388e2d158f13a7844c0adb9cfb6b392665a185ed7ac8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4551,7 +4868,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4564,9 +4881,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 256, "44bc9b7ab82a5c8d7766ea00c5b044139c11cd5e10d2ed63f8890046a693e6e7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a", 256, "1da73ca97ad2e6c64d3d1d1f3ae5f35cb354f89a9ce0a33302e21e37dbb8af93", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4634,7 +4950,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4647,9 +4963,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 224, "6db5429b6d878d9c6d271de05b9edaf402a36a8489e093fe79d7e78d7b0feee8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a", 224, "9ea86e655333b9c1221c54ba75b24c1a1f980c9994f7b61e7508e4b9d5ad231a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4717,7 +5032,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4730,9 +5045,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "06ed8ead992feb9910c535c2a1a91824861afdfd7e81d82bc4631746bd35ef34", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a", 224, "7e32ec83f90269537c5849999bbf3b35f22b7c533f19a3d32adaf6e97c6bc367", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4800,7 +5114,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4813,9 +5127,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 256, "df44cda477bd4710b32d020b9832a160202dd54c446ec1a0c35c8fa3a4ea7285", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a", 256, "c52912cf605631efdd07851260e37c6823c2c4fe0517d4205ec3e0e6fba8e9b1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4883,7 +5196,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4896,9 +5209,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 224, "3c781c5cde3e9cb98e53f1b1be7a527244d9250fca90d3b32af8f384d350deac", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a", 224, "42dccf4301719f8d55ac6faaf37be64ad2dd60ee1b080303c8a48eb4053f8992", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4966,7 +5278,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4979,9 +5291,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "7d0191cb8570a15064c853dd7be2578c81657e1a43fe42b24271c993d4b9ddea", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a", 224, "2f571039709449bae6b168c12e1b8971bfbbc0ef18085cb34f76b0d65cb505d1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5049,7 +5360,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5062,9 +5373,8864 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "de8e30b21e02c42bdc7778a538097d2afda49b07aaca23dca798dbbf7a757759", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "e96578c3444c8cd0eb6aabdb39819274f8df0525513d6a5387d01bfc73f5a97a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "427e31447b632d50c306c99701874ab2d6d0a490556a66091177bc162a272109", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "5161b585e08b5f805d6bd51bfa44beab9eca8026d3f2cb676251b30770b18412", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "ba2f9978b7b7ffcb3fd39d84280dc39b29537f61f204c06ad4712d6cfb662faa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "23d1464a07d486390a6db8ba1bb259eaff46df4fd8763334a95b940c11b822f8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "a5a067f8574ab087e7efd6101c38316caf1c1738de37844ab8dcc70f1a9b0571", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "5ed78f6bb035ab4e60e91c3604e4a364d5f042e8d70eecef37789c60b2f7d91a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "ecf6abaeb1c673d59c003a173a5154be6341d417bbb38b87020eb0fdc346c54f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "ad9f76c9b6b674ef1e1223babdf20bd09865424cc21a2f57f6061df2d3adc847", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "7961c6301a165be223063baff2b348dd65d335c92f0b8dadbd23f2e42c677b3b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "37ba57822f0f9c45aca92e9ab2a0f83e0c96755d8cdf1238e2db5a7e08043efa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "83af9d72593eeb32d5e765d38b8cc288494e9c7a2bb077c914e1f6850f9d6d2e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "b6df5dc66b3ef8f14f05af4765edac53bf40d8be5b8d315aa86c78e0650acb65", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "e0b4c122d8d65a95e142ce4f70f228400a44b1ef7b828a84767f8163646df302", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "00f94d2fdd1c95f044f1cc68b7f4d5b44521c6b5ea2835e73598181400140b0d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "0fff8ff7967dfaa1f418bc2be3cd53c76cf63330eb5a5d70067288cebd3faa67", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "fed579bfab38ca7f85fc8256cb7ec6f4680b5fffce8f444190868f917681ab2a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "c900a1555ae014b7b825ea96636f6d51c405b1e5806bdf4900aa82548cb5db52", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "d02e30e5618fd611ad1992dc7f95c936a739108cb591995dd9651f571fc3e017", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "087d0217b71e5813a71732f7b2187d296c0baab61a53830d413e8698a79de66a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "80fbe3f75b6b33f20fee695f00bf66eb744303eada09b6acba45713b20337e7a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "62e2b0a34de2566d69571c94a7235e3cdad99b204944ee74527c8915b44dfbc0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "9e46bdbd3f22c22deb7579945d49429dc6855dad6c6775ccddd0bc2bdeb14cf7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "be32940a00f0fa07a5f1f78b76880459299dd6d27d9d72e618fb3694f29e6917", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "7367037d729f69ad78f378a45475db110e93a471e4db932ba34a1cc9ae1e715b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "065cc8ffceb97f61866a4d200895cac1d20a06aa49f68f5f811bd6589cd6061e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "59438173979e8a11bb188964d8b8bd50d72d37e7475097d619914c7301393bc2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "26392ac248f0073a4c3e08b8356e0272b88b021234172a11c39f405e7602e65e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "b77ef9f4f1af44c284bb993f9e912b9bd34ea7a0886dece220585268ede59b9b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "e9c1792527ad8992c85e97bfa5c397fcff48bfe20fa3633421f2e55d694f9bac", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "69e49961b7a5f0dd8fb31b2d88244588f6b64b97fc9908a4e4b884718d4f0ec6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "ef88202a4083d632f2685a2762db0e1b991c7f66f47a379a528d7ff3846f9b14", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1052672) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 16 +, /* mMmaKind */ trtllm::gen::MmaKind(1) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 1 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "80b7e6dae68028114ed485e5e0f33d8c6c2873c19d23308db7e593c13c2a5ec6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "07639d69c0c1a72138b2d5f3662cf08fb65da88ada484aa19f4fcdceeed468b1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "3fe162fcc943fb65da67f06e97ab25cf4fd23108abd8eaf85379713d16a1db6b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "98c0b1aa9ab6142c3ed3609f7786ea4846b99b9fa2c38cb1caf30ea92a48f3f9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "b25c9225357591e379058f33307506e6a55c7a9b5204dddb69da133e1eba9962", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "87898c8325062914de9feb49d24f145355f43702762f0f492023e49fe1e7e9ca", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "1891d1e081176c3662f4bcd17650c087427715b28239d7602740eb6403e8ae62", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "c1e13621767a488961d8d7b53e2527ef5baf9d609fe9b0d4473d6c6231a8bcd9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "2ea013b7bfac8afc017b2ea59c711990c63a4e11f90bf6990dec837ba1106ca0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "d4a05e73dc6f5f90e0fb1bc43a05cf077ccfd45d38279066390d788ed11a5471", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "d80cad89d57eee85ac6a0c5e1c6c7bff979418f69717337fc42b4e4665cf82ad", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "77cc7529c6f37b3dec1c063a9824cfea25fe94f53bd43f43801767ced25a744d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "9f01d68fa4680d1a4cb69b8fce80e058fc69117a954a208cf4dbd47981d2bdb3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "fb020c5d4a7fe5419e04fbfc52b2dae40cc7187809b163df089efa0ac1196d03", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "42305021ccca834763052664b8d80aa95d75c5d537ad59f1a234753fb216b418", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "95bca146bcbcd3e0b1eeb76e6aaa12752b8bb9a6ed2fa2cf27da5e2a625741b2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "e6ec3c422e856d61045cd503c3364c63e86ea844d23fa29a9082262125b62340", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "c59efadb84de8811f4c257cb0d89c36ee6bc1a0bda49aa5fe290d5b377218754", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "f550e8bcf5c4d3b10bdd5b842f6f681b493eaacef07ccdd02900902c2dd2da89", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "bef98c26bee0af506e4a134dd6a974bdc33c1ffb09f5555c1c04eccec6bd3e6e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 125952, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "1dd886c46f49e7634b13bed151c89bc39c3eb0ecb0a50251acb3d98b6dfd8bb2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 125952, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "2391817e2b783dda84fae2a98e369231e4cc5adacac8050d4964057091926acb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 163840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "5be76c0bee34f45382984a49d1e2f4079ba6563fcc90e1718f272c46250fb657", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 163840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "f84b0785a6c44d1a172314a1bc47e96033cdbabc0f60b6d8430478465a7ea07f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 125952, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "232e0743ec76b2f621b4ca846a7aa31bc2443ae2aa573e407878044dc93682f1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 125952, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "d409fa0f4fcbd7ff19cd043f710d90f78b7f5662656c3ee82d9320513df099ab", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 163840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "325169814fc8c4f8a138160ec98f4369f124055fef360fad20c3892afbd1d211", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 163840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "98503846cdb753846286489bf3cfd826beaaac709504687910249f086400d6d9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 142336, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "06684ff5dc0a8228e70b3b210471e053703354ea4da00513f0140ecc85b7d344", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 142336, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "9b8e0c96d0299072f75d2d25f5144d9152f85f0c4c76fc21f8c57b3fb9fc149d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 185344, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "b4581de6077ce5cdf9ec89f471712cda5461112d9d291be84efb15e0a7fd90a9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 185344, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "5face2201c47b9e7e39bd25665a565a9ab060dea17c1a447d5655d36f817c7e9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 142336, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "1f502efb591936ec18bca1cb98958ef7f9e61d35695e31f91033178f1b84ae06", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 142336, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "d4a7fbd8d97e192622e78deb9d6e29808fe8c20315b92a5d8282868ef4ddd49d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 185344, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "b362cd22bfe5f02d0f50810bbe32cede78b064686cec40fd6171664e12703484", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 185344, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "eb7fe5801ecb61eb5eecfecaa551511a89d99c83256f0d0d8b7ecd74cee1f1db", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 177152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "fccddd52aa24457529fd60ebea542736416eba3cad140fb56f46fdf4cf803d1a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 177152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "9ca668d46c5260d257621784fe30f831d0929e152c314c395b0b6ab713b65f77", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "450291d4661183f7b8083d8b6aefcb86cdbc153e5f866bea898606760f743fe1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "779e1d33522dde4022fbbe984140522999d422d6bbd8f12bb5f33c82c06cf6e4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 177152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "ecfd46bed256ba455f9b3057243ecb4bcbe0b6e486c5a5b71d4e4199d24ecf36", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 177152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "bdc1d45e29b39f894e990693c935d2c8a8b16fb36a298e5acba41956e168492a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "b20b1b0ec3067aeee05866d120d87e68e4cdc6bab03f20f2ed0a5538fe9f13f0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "9df59210c235eb0b19230c45917140a3a5722f22f456f5ec3586aa8a75eff4fa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 153600, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "e354ae366b75d8f8c0ac169c420a6dcabd296a276ed48ceac5975a638947feef", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 153600, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "ccb819c4907a8e7a314c3f9c347ef65448d7690e8b546df0d3bd0aeeccf881af", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "de755e7f119531f6bcdb5ed0b7c322206053f47a82066aabbacd936030acec56", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "f49d0dd316f7fb020e7935b964f47f274532b2e9272c481f98875478ab8e294b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "24451361a2afb48a415b024c1eaea68ac425149d152b67a4df0b26d9e2eb718f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "7e0cc5d50cd472fc73f1b625301bf12c4360943bba3ed13a6982dcce40b88eff", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 159744, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "d3ad5c1a681187a9ab8fe466a841b71f1a485cc9fecf9fa8e3efb0cbcae320ab", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 151552, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "089c16d27c78df52f119bbc313a991489ec27f82e86d5d5e6bf6ec7c9b42bc75", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 195584, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "0ecd8bc7fef312561bdf50ae2311790419cdf70f4fe1dbb58587f1e0779cf1c7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 187392, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "62593ff687342b4bc11c11deae17bb18797bd377079994aaa82c6deeedb67539", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 231424, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "5a9417173b4851635aac29bef61a283ced5ac9065b7eeb11f80e7b5133cf708d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 223232, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "0117ee0c2d1de9525b6d70c1817372a4eca43d4c171402af2ce106cd6668f6ad", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 153600, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "85a96f3ef4d4fb87fe6410e0282f583c304f57bef50ecf8efd8a10b14e887d15", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 153600, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "aeb36fa1063a5ade516b0c45efeee854b534077127f88864041f5ba26d09e6db", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "27f69561d213d2a5fd27df38abcd43d91fd6c7241df6aaf5069d67888ea119b3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "e2731f35767261d005b6c37f10ca462ca518727c4f97781f3aa731ef60f0d1b1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "600d2fcff96773f9fb9472077482028d15f7ca7e10b75133ad210550db7c655c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "b8f92e2e5cb8e22f285a31dcf392e4df3211f6e3fe3bbe8e17b9d2a26984bf9f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 159744, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "c9073b5f74b349203e3ef78ced433724da133d0d7e82cee630def987664ea67c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 151552, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "297f52ced099c894b755b3b14c9ab1b863455a4c3449bd53dfde39feef1abe0c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 195584, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "38f22a3aabb4013ab208d60d881e58b973773cb38e2ccb5e16c5f4e8d77bfdf5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 187392, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "da33c0bd8a973244d48e50a78f6ce85325c87406837bb0271e3b4f6d759f2fe4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 231424, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "44c6d5f44ed248f39307d8a8b30f2b760ae049648c3b6606c77cc7a969f29b74", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 223232, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "164b83e48704f0d21be219593f4e45890dff2fab76df7fe862cd31a7efdbd46c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "4233d4ddf70384fde3c422015a3ab1a784da950b768ab3a3a70821d46bb2d733", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "d74d46c6278dbb75d370d2239d747b8553bbd3c7907f9463d3b64199542776fa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 231424, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "d5776b4cebfb8879769ed4503cc36baa5b60daa6134c4bcb9fc71a9a541f006c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 223232, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "f7e8d887ec639b53cf3c168e5c04ac3f2337769267225b7a85f53e0eb024dbee", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "a000741ed0c0119824cc9fc15c2d1d30e567b27141f64ee5a6fc615762f49605", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "e8ddd496ec2d4d23fafa850073d1a89eeb80ed015f4b2644c253567944af56fc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 231424, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "2847fae3901bc983b75eab52daa3abfce95c3c651b27a2beb4c100e654e5caed", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 223232, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "db43a484bcc9b97e8e1c7e33ee46f5871d622f331905aa62d1d14b5dff68dcb4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 0 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "f9201b4a07e1e8140f76d5f6dfa29dc39f22b025ce1e262ea288072c88a2683f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5132,7 +14298,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5145,9 +14311,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "959d4374bd5435bde60198df8d5cf70e29aa9879b2f1314083b77c6c0381dc1f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "fcf92f36d289d249334f0487a4bcdd737c47ffba9f2b93e6dfaccedfe88a8d8b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5215,7 +14380,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5228,9 +14393,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "968c88380b4577288c7e88b0bed6eb93f1ad63c9b17dc999f3d7b7a5f4c2735b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "8fe3b288869c583d508b813e9f3125d197d2f149a11b588507556daadbb26f2e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5298,7 +14462,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5311,9 +14475,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "d6179d09f734c89c251d182d6f9377b82f1c908f02ced331fc8c5e0caeebeb75", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "ccd86d8e1f164fb730575996a4309aa05f01bb59deeeaf3e4f191d5279e2809d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5381,7 +14544,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5394,9 +14557,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "112967cd390a0800afadd8ee956782779787b2598986b00951ed464b7129b9cf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "ad0624cf622d2b2753e7ad558cca0bc92618023cdfd480689fc00a0238d89b81", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5464,7 +14626,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5477,9 +14639,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "a2f3febed9a167985b958e95c0496e26d69563410da2fdfd5897d8ed7e96411b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "77d87f528d6e7e138d3b0e47b7b6f2108fc32f109c16bb03e898016b363ba4bd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5547,7 +14708,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5560,9 +14721,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "d463f269d50fd378c72f26faa8afa3af0b0eb2eb98aebfa369acda464d08bac1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "5413bfaa6fc81dec10753107df13beeed455c9a2583995c5b3c16a5c4d93300d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5630,7 +14790,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5643,9 +14803,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "c00f39dadad7d89058bf39415e3182029d4d9ead814360c59287ed864e91de8b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "913d5bbdf477ed35c8ab729a7bb120b501b454804aead123a3cf6d0f139969af", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5713,7 +14872,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5726,9 +14885,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "d055950053983b9acfc627db7d794e449607a78403c4938e7dd0ce35a5bdce86", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "6cd2dfc20f9c6845d5ab3072c74d1797e55a5dad9c189c3ee304e730f81e18bc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5796,7 +14954,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5809,9 +14967,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "49461402a3a0a2e6bef4ba252f922069f387951f88ffa6a17a585cf737b34082", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "51126ae7438ddb5a64d38db1f73a3c31f8c6761433217938fdfdbdd938fcbad6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5879,7 +15036,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5892,9 +15049,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "1425eae206c8f26ae978ba006ca8c568aa93e558c38fbf9ae1a245412175232d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "cc7d6e878b8b4efc5acdf8a819cc6f7c747959acab2d59b5b00888030e9feec7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5962,7 +15118,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5975,9 +15131,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "4e312fcf2ef735587987e0ddf18ff1fea51276b0b2da421bbaadd0e80aff44a4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "472450ff9e60666626228881697af489820ad8d665ed4a786a3e781e291f04a0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6045,7 +15200,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6058,9 +15213,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "4e9f19a11006291683600c47b0817498fe3acd82bbd11b1783d4b677d827b3a2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "56a15e9c8027e0542f04694c33352ad76ea38fbdec785002224bb9c6b3ae6731", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6128,7 +15282,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6141,9 +15295,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "6354eed0dec928ba6e6c25b4b9d1c4385ae6f9ed9be908c82d87726ebd7317f8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "81888d0b169e38fb2c3e5cbe675daa37d189cbbfe2697502975b59e4ce4cdb4d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6211,7 +15364,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6224,9 +15377,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "a98f41f0f45bafb35078da89ac4290a1a025b235c5e96bd9fd4ab0dd78bb8e1d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a", 448, "1264399a1042b63c62da0801d993ed4a247b5a34f256f2f88d0d08238b6c491a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6294,7 +15446,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6307,9 +15459,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "ae447ae26810ef8447fe5c8ec183cae3fbd4206f29965f532cf0c89861464516", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "4b0e08e3fbf6fa5664e9b68b459593fa3e90c6f7bfeb21fad320c8be6de6bf3e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6377,7 +15528,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6390,9 +15541,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "4cdf70f8dca79f9a7cb001a2440c7ff01b1581d71d36f20032e76e5aaffcd01b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "0504df6db869e2034e687dc9fc180d5ce94f6c6197f892009b5fefb74518e09f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6460,7 +15610,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6473,9 +15623,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "af9f731cb3519d14d81b49896e0b4bf6f7452aaf6374a1bd62a0ea267c92d58f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a", 448, "52128f053973f0d167be7d30f47730f9f95bf3d94273860287ded9c5c5d1a162", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6543,7 +15692,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6556,9 +15705,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "6df8273d9a887d374a2762c4df8210bdcf2fdf407b9944a904a0ebd576bf6487", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "6e903584077011be3ed6601075491a0c7c08a1555a02d81b590d4f6ed6520963", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6626,7 +15774,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6639,9 +15787,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "63138b6c484124ae15eafeebcb041be45626060e93d0af7e87136ac74ba2ba4f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "a33e5bee58bf10d73c19e5745a110f56975a3e2c25a02019c2e2322c5b22235b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6709,7 +15856,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6722,9 +15869,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "e4607395041f8c0a0ad7bf84a783af9e3c82d5aec43ebda987a22ca7f3d774f1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "4082e2b82d8eac9aa80e3e51911aac1cb32121a3b7920ddda66deb2189778b29", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6792,7 +15938,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6805,9 +15951,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "dd4254f835bd2ac7320c5ee7f108acf87d7c108918e8e7581f0551c78481ea23", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "62d66554997f9c5cbd5c987f5ce6fab48466c599b97916391082c019b70b0f91", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6875,7 +16020,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6888,9 +16033,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "92681b5f9d31cb04b18dad631032459aeb37e28f94a327e3251c1e97d678ade8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "dce9a0dce5842e44165a325870e6226e18637c38ed80f917213322ca3e9453ea", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6958,7 +16102,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6971,9 +16115,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "eabf7d440d758b2f8a63806443459b1278979022ed52365579b323e48e2bf2c5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "ad54a27256cfef3e505cd36fc7597248198a542937c4c3abf596652cbbd8456b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7041,7 +16184,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7054,9 +16197,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "eb11125d29cbd44870617b2be27b6cc134efb42dd2076e64780bdc3a80d9012f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "0eb9278105296a59046d6ea6150f4a9ac2a106ba954c1755fb5627703d799f1e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7124,7 +16266,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7137,9 +16279,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "67e8a00d8bca5f44f8b93f1b19c1ac0f181a7b757c44374f3411f44f8ec87b1f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "6b0f6bbbfde874c0a70a64ba3aabb8a59e28506a91650d1d7fbccf1b13ad38a4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7207,7 +16348,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7220,9 +16361,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "faebb1be2f8db4ae77f693a4f0a75dd3986d629009d71f929db24ee06a644c37", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "d923d9394d4d4c415ece025f169420527a183624e79efc788fc8210243934ab3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7290,7 +16430,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7303,9 +16443,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "330e800b99311c05d0fdac89222b0c95c7f561474a0ca59cc1b620e01b246815", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "45264435d756b616dd667f1f3843f65c3ae126d6bc3051216aeebfbdf1c19f5e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7373,7 +16512,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7386,9 +16525,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "e3f90e1d12fa25dbf30d04e4433ac1a9d93fa328c02fd6d0d0f966726fb51344", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "bcdbb85503d191634c6905f89c3950b6c049d488b40e7af6bfd44dafb85cdca8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7456,7 +16594,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7469,9 +16607,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "34e89d6b7ac2a8a35461f2778c6909ee231fa94c7955bbd7ffe5a6c714dd0e9a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "190991fd067583e81e7940bfbf72bad68728d1d07c7a8523f56228d3b31cb626", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7539,7 +16676,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7552,9 +16689,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "ff3bfed14ac5707da369899773c9a6e7b179b1f788d508ef4076542a41837475", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "615e30e9b473531ecf56ec9eab7138f81ad151ca55be42b8a0281a19661c4426", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7622,7 +16758,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7635,9 +16771,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "a77de53ef185c6c9e576d904651f4730041804db5adc0a93d000f770a35ebb4c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "1856e55a275018c2115770abdd9d1fe34fa38d6af10b1307d9fdf35552eeee9b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7705,7 +16840,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7718,9 +16853,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "e48f9da3611ec12be2ac1d17826b6a65a134a3e7f3c5101b4d1961ccf182ff29", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "e29c3356a2500f376c9bf8a3293a1b08b7abe138f79408f814658666e4be5b59", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7788,7 +16922,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7801,9 +16935,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "216e8f52552e5156288f85a82ddb9ee95acdeac42e24be8896889de9c14785cf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "30c631e69db4c3d4dd9ef7d5efb3aa2fbc058d8e6409b021c7264f3023bcd8cc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7871,7 +17004,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7884,9 +17017,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "40de19e18295201f77feeca58ac8fc73a35a3a20e444dd68f3af137384555bcc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "8f7ef98484ecdb6e16977d707cce581e07059bb073c15af060d1f49ff54936cb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7954,7 +17086,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7967,9 +17099,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "9f4a2d439ef5608eb43e4683d8405e20a8231d5e05eb6105636f3b33bcdd94e3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "291c2f215974dec6ac1b5518a2ff145070ae32b1748182babec9c28c9732cf8b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8037,7 +17168,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8050,9 +17181,90 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 256, "1dff5de9e3c79ee18c9593162a23ff5f19b2074735c0c6d68dd979296ced197d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a", 224, "4973310b9ec962fd2f5585ed23cf0d1c45235c30df7ebecaa8147710a43d2c9a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 1 +, /* mNumTokens */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 256, "91563530f04990e31c8cd5d7ccee5cdcea655c1b4ecfd73cd0683b46d48521f6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8110,7 +17322,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 +, /* mUsePerTokenSfB */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 @@ -8120,7 +17332,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8133,9 +17345,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "53c5fcfb99477cd81d8da10033146f17fb2a7b886d7824e412127c1ced2c1865", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a", 224, "e7763784a95c16a64f62e8ad4469f40917ded2dad062e9ce6b4885988890db36", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8188,7 +17399,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 512 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -8203,7 +17414,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8216,9 +17427,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 256, "fa0dd0b52f880b99f0e2bed59ca0ac46251adf291608aadf2cefd79886c8ce88", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 256, "afaf369c24d661e85cbcb99c65752567f91fe54cc6ce8cda0cb36be3f1ed1eb9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8276,7 +17486,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 , /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 +, /* mUsePerTokenSfB */ 1 , /* mUseTmaStore */ 1 , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 @@ -8286,7 +17496,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8299,21 +17509,1004 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "dd40bdc871a1fb89f5ce777833f49db5168e5a418ce9affd81779d4120148466", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "aa3bc06ff914d5fa7baebb60f104cc02d5306a48bc5d596c24df457a57b7bbe5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 , /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "f27509119e9341e333e243743fbc42346f1aa8953d512a42b9c2b4826b5b52bd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "cab7e0887c19d49dfb7823e8eff72ca08ccf283fb6b90ee50ec5d67cca6b178c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "06591472d49fd68f8fc88a436ccd0cd7cb2ea3b66437e6c47bf1b943deea0124", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 222208, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "a1820b52790c9453b1a8426325ddf68aa1ee0aa4aa14c5585f3898c61fedfa03", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 222208, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "7fb5c548a1d2e94f8ffcf61889b4317f349b5572cab8faf3a7347797c186543b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 222208, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "f1288f5ca6fc863004f946834eab0e1812187ed532e55ef011fdde8dec07a38e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 222208, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "ac3a07b6adf6965ee43d451b7008757c9f827529947bd6dddf1132c34e1de5d1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 217088, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "3d6a2c284b71ab1a9e4bbc1f14ba18aed44838b0d60f5de7a3442e771c4d6f39", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 217088, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "b8eaf9fa419b40b1ff9f095abd428d2a63e934a18643b3b0bc5303fd986e1fbe", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 217088, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "a3eee3ebb194fe171e30e320b973cd9771bcb84a008061d3215af237a2b6c417", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 217088, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "8e0362ce10207421300589dc3793978c0157ed59ac63afc3a67410577f854538", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "5a9eb70094903767ca8fb09c5b3c907c48e428146ab3f8e693fbc649a1634d3c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 , /* mEpilogueLdtmDps */ 16 @@ -8333,7 +18526,581 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "444d88d4a67d0ccaa3884b5e42c9ddf010c3adb9711e745604accd51671ee589", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "4cb907d0a2a145e769658234d551c3d94d8c449fa74c52bca210337c4c670302", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "0520b88c899688b49f740cfa9a7a268f541ecfc26264257d59bbb9b116f84013", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "b565edc3dd1c735a12b19803fe13c1721d2a7f2f9c18d9ce8570e6a37a3771ad", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "da16b182217b177236df0365b40f2d0a3c118a6d07a9d02ef99a3f33c4245ca0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "03548184b958f9911abc4230146e367d4c1a373ddd898a37e75f47adaa52f2dd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "6feca0e87cd971c33d0e893193dd779a70effdc80457b4e6b234ee7307b358a7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 @@ -8364,27 +19131,26 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) , /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) , /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 +, /* mFusedAct */ 1 , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "b4890c8cb043c6feebbd9cdc40fdb91314dd47d747e2e6ae18143a6b3b282450", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a", 448, "c5ed3ae4029acd951a85d5f8677238ba3dd49d589fecbaf578017e60a1651eb7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8452,7 +19218,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8465,9 +19231,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "a0caa4f383910792fdb1fc3e7db17b7f2a8025a2a9ff5be621485321ff0f433d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a", 448, "eb3c978b692e8a741f87252d19b919b9a41f5d5f668928be1f87a580afb6d360", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8535,7 +19300,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8548,9 +19313,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "1a2c81269bc212521650e4be0a3b1442bcc8186ee39158339f40f1d2b6c0c474", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "0021520de6ed1a41176410ce74fc6daf2479d59f896d129bf261cdf77d73390c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8618,7 +19382,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8631,9 +19395,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "0cd7c32083e34e9f66e3b2902771e25da1b092ccd76dde3410d0aa7dfa9ace45", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "5466aebbc701240fbdec082f2c221da33724e36e9cc37244e1c663351237d3e5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8701,7 +19464,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8714,9 +19477,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "12334edbcc35771adaab31c1d686604741fe698dcb74af3126b78095f3c9c60f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a", 224, "799f43658600a41c4e2818cb4b7b4bbc21671579203965fa7c22bba8807bd20d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8784,7 +19546,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8797,9 +19559,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "577604c36f4fe236191cf1b4977330568cc02c612e954fd3a7a2cd88b87e75c6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a }, +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a", 224, "c84c730baba847e750182a60ae41edaf2f9a0ea0411edd2c026c9e12947edab1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8867,7 +19628,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 +, /* mClampBeforeAct */ 1 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8880,8 +19641,4599 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 -, /* mUseTmaOobOpt */ 0 - }, gemm::SmVersion::Sm100a}, + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 123904, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "c2cfbc6044bd82ef42dfe764e9c643a45425d772accdc03b5d8377fa6d632cf8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 123904, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "3fde5d47184b17a6af71422d4daf94641e796f735a8a7693bb9e94100204c173", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 161792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "29837084d85e9b1c9fbc8dd727b650f97e9410ef263c0691fb2ad72f275b0811", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 161792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "07707a930b5a3993c338e678c6ea245a7a26a36872fa2b42da288a762b68daae", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 123904, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "1632e3669c9c1476469d702551d80408462a570d59133ddea4038b15aa4bdd92", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 123904, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "02f8829ceeef6300573b6b3f1df1039f3562e553902bacf07f40dd3afb67fd14", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 161792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "50de36ba81043b68eb60f5cfcf788971fe1f0baaac95a5dd530d8414279cf23d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 161792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "052f40c1bc9596a042e6f0c67323329dc7e14db3a124a480cb51c6eee6cdf49d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 138240, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "96b6e482109df2f684c6e557e881e31e596ce94834e91498c63cb5e64b6243f5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 138240, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "d9bd6253fb5083a8affbe592d8de646de77f4a9747df3a34cc4396b512bdfcde", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 181248, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "55ad7ffe4032ba284a929b0a7d6212b612722fe006f0e4441b5205d8de103b24", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 181248, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "e6ce1fb072df08f8d12fe96d53796c78bdb836ee058f9bde7b5ced2278fc90c1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 138240, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "0d577bd2ee65de048e5250f3c7d9128aef0db168ff7d231fbd832fc122cab47a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 138240, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "40628f518c6ec0bfb6134def0ad8198d2faff2ae309099bee4a35763818016ba", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 181248, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "a74ed7009c8c4033fc0f02a2ac0ea2464a8829141eb7433e1a4d9d3d8b692d82", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 181248, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "515a9a9f96a1fb15d4662c2cc9c87525c5da9b6bb268b6f8c2fe0b37623a97e5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 168960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "934e0e149a2b20c7cec7a15916bd15d21074ee9d3855fd3f1c18a411ac6c657b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 168960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "97e4193bc73ba4834bd92d6b938040d0cc77edfc2385931636fe26ca735b3bc3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 220160, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "14887d0f230c05da6b55e19b2605115c1585429fa7e034546ad20cb212b340c8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 220160, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "344a897c696a28b14b4e452e9b190da1a4b9f6cbd9ca41472eeec348dfeaa9d6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 168960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "cf26f5247795cb73af7165eace750a99d1f69904fc05c65aab0a7cdbc53b36c0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 168960, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "e86aad17b6dd651f5599b02f794c83efb4c78005736bf0fad5e237eb85c05033", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 220160, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "6d0836d883e8fa30cc6dc3e1dbfdbea14e459d47d3387ccea49e05cedbc2e811", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 220160, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "6e02f2ec03e0c7264b291928e5c63b2e5ef1f85441757f10bf28242cab842625", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 152576, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "f665ef60036d58f1875c6630fd3c36b4a4690fb6da50f8445ae134b82ea04e68", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 152576, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "ca51982e110248abd0d4bdfbb70c6d76d1fbc23036cc441581de5b943ee5198d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "db78c26c8ea7e5bfa589444923b52abfa2a7a8a0351159e049c9267a738ea987", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "953c318c76da83038f52b569de1c06088d3306f12a7b479bbffdb2245735d732", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "5e2434ec5d835069d5f781550ec344ccbed4989bfa925c8d4697e33d0f38cc74", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "ec444c21c52a58becd398afbcf546418d9e114daf3f8a8dce715bd9397a857ef", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 159744, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "24b9797e2a03dac8ef9db0cb7a3f29e0665d0f3195bdc12d3e97301a6b763373", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 151552, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "ec3f5bb314b51d1c68dfd5676b56744bcfce161e13ec56118daa6b37dfec6b5b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 195584, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "6f9aa51781f908a895e69467f9a498b9ddf0f8ca4eb3ff7588c5d716431aaca6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 187392, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "ba484fdf619a6072f398a18c3cc4c5dd01c4068c55b08e364b9122c5233e2cd7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 231424, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "dcba9f175b64e4998dfe660c72a3a076c8da9288b225256d3792dde4c56c1a48", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 223232, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "b225fbe8816438e48428b707e2ba6a205c67acc4adcb01a118bda00418ff406c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 152576, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "2b2ec120e9059aea37f17bd0f345173ca19ef3ddb7f0769a441c4672b01eaf08", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 152576, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "2462903bfe90426892b69393811f8ce0f3209794e1512164c0cd8329d6a85cdb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "cf97903c46cbb0c86bc8b80d195304e94adf293ad8f75208749a5c2dc45e4e4a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "9aa4db4b49aad5da17d2379ca887aa9b07a9d0391953ac211001cee1b234a73c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "92f4423ee6acb3636ac296af153218f1c42f5ef1d805a11fa58acc9944700c8e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "465b71c6f744c338beb22fa0bfaa9eee39cdb9acb7651544a9a2294dcb2d453e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 159744, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "f54e9c36437932b25597cf9d7ccf7a29c9724bc4ae0309079cf4d2e29d9a6dce", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 151552, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "a45e0c9705504596a77df89917e740e6a1cde5b343e0514573b42ea196086375", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 4 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 195584, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "8ef260af8dbd618bbaff3e5a0915e90b074d0ce8a4c6c956ce37049b0200aa02", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 187392, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "619863210983a9841e0c7bf748ec2b8f32d99b14e4fc25f202fb1fbf76486601", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 231424, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "e9966f2eb459f6516f279dd530a740da25b97f98306426862882ef261994b178", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 223232, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "e2d318df8eb6a40e08a341ac6af1e10481702ca8fc9170f0443a96f4b6d490e8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "9633c5cbaf70db0ee900e52ea2eac701cdbf2e3e0f7808e153ef2a70e89c6d97", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "a130653f7ad081a498b99bd007ed51be3c7cb16abffa071b91e36502a3307a65", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 231424, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "4fb3656258c3d98d3f5fd6969a4209d63e93239b41ca980b04debe2e53106b02", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 223232, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "7f4692ea596508b3ef3db1f36f678f04089df32e3288354bf5c2c136bf65c188", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "4e4ea1a60167567c6bfb4158863a37128d287aa9b36b375d58faee8219a56171", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "3a7a0740f2e48c242086cb7af3f8b88a21edd1dd75e44a2bc555d430d02158a9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 231424, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "3ae741f0da39d883451c619222bc39b1fc4c9cb314609a7dbb77b3efb9e1cd89", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 223232, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "8197034c8314acff20cee4599db0d371f4096651380693fbce04ecb097efe96d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(1) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 2 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826828) +, /* mDtypeB */ trtllm::gen::Dtype(17827853) +, /* mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(5) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 2 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(0) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 1 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 1 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 + }, gemm::SmVersion::Sm100a }, #endif // EXCLUDE_SM_100 }; // clang-format on diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h index 79b96109c1..c67f4feaf5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h @@ -18,7 +18,6 @@ #include "trtllm/gen/CommonUtils.h" #include "trtllm/gen/SfLayoutDecl.h" -#include #include "BatchedGemmEnums.h" #include "Enums.h" @@ -52,7 +51,11 @@ namespace tg = trtllm::gen; namespace KernelParamsSetup { #ifdef TLLM_ENABLE_CUDA - +////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Member functions. +// +////////////////////////////////////////////////////////////////////////////////////////////////// enum class MatrixType { MatrixA = 0, @@ -60,38 +63,6 @@ enum class MatrixType MatrixC }; -////////////////////////////////////////////////////////////////////////////////////////////////// -// -// Utility functions. -// -////////////////////////////////////////////////////////////////////////////////////////////////// - -template -bool useTmaOobOptA(BatchedGemmOptions const& options) -{ - return options.mBatchMode == BatchedGemmOptions::BatchMode::BatchM && doesRouteImplUseNoRoute(options.mRouteImpl) - && options.mUseTmaOobOpt; -} - -////////////////////////////////////////////////////////////////////////////////////////////////// - -template -bool useTmaOobOptB(BatchedGemmOptions const& options) -{ - return options.mBatchMode == BatchedGemmOptions::BatchMode::BatchN && doesRouteImplUseNoRoute(options.mRouteImpl) - && options.mUseTmaOobOpt; -} - -////////////////////////////////////////////////////////////////////////////////////////////////// - -template -bool useTmaOobOptC(BatchedGemmOptions const& options) -{ - return options.mUseTmaStore && options.mUseTmaOobOpt; -} - -////////////////////////////////////////////////////////////////////////////////////////////////// - // Create the TMA shape/stride for A/B/C. template static auto makeTmaShapeStrideAbc( @@ -102,83 +73,60 @@ static auto makeTmaShapeStrideAbc( bool const isWeights = (matrixType == MatrixType::MatrixA && options.mTransposeMmaOutput) || (matrixType == MatrixType::MatrixB && !options.mTransposeMmaOutput); - // Whether to use TMA OOB trick to block out padded dummy tokens and saving BW whenever no routing - // is involved. It applies to batchM and matrixA, or batchN and matrixB, or any case for matrixC. - bool const useTmaOobOpt = matrixType == MatrixType::MatrixA ? useTmaOobOptA(options) - : matrixType == MatrixType::MatrixB ? useTmaOobOptB(options) - : matrixType == MatrixType::MatrixC ? useTmaOobOptC(options) - : false; - // The outer dimension. auto numTokens = (matrixType == MatrixType::MatrixA || matrixType == MatrixType::MatrixC) ? mM : mN; // The outer dimension tile size. - auto ctaTileNumTokens = (matrixType == MatrixType::MatrixA || matrixType == MatrixType::MatrixC) ? tileM : tileN; - // The outer dimension of TMA box shape. - auto tileNumTokens = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileM : ctaTileNumTokens; - + auto tileNumTokens = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileM + : (matrixType == MatrixType::MatrixA) ? tileM + : tileN; // The inner dimension. auto hiddenSize = (matrixType == MatrixType::MatrixC) ? mN : mK; // The inner dimension tile size. - auto ctaTileHiddenSize = (matrixType == MatrixType::MatrixC) ? tileN : tileK; - // The inner dimension of TMA box shape. - auto tileHiddenSize = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileN : ctaTileHiddenSize; + auto tileHiddenSize = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileN : tileK; - // Swap matrix C sizes if output is transposed. + // Swap matrix C sizes if output is transpose if (matrixType == MatrixType::MatrixC && options.mTransposeMmaOutput) { - std::swap(numTokens, hiddenSize); - std::swap(ctaTileNumTokens, ctaTileHiddenSize); - std::swap(tileNumTokens, tileHiddenSize); + numTokens = mN; + hiddenSize = mM; + tileNumTokens = options.mEpilogueTileN; + tileHiddenSize = options.mEpilogueTileM; } // For a fused activation kernel, the hidden size of output is halved. TODO: That's true for // gated activations but not regular activations. - if (options.mFusedAct && matrixType == MatrixType::MatrixC) + if (options.mFusedAct) { - hiddenSize /= 2; - tileHiddenSize /= 2; - ctaTileHiddenSize /= 2; + if (matrixType == MatrixType::MatrixC) + { + hiddenSize /= 2; + tileHiddenSize /= 2; + } } // The cute tensor shape for A/B: (numTokens, hiddenSize). // Note that TMA descriptor expects the first dimension's stride to be // 1, so swap the first two dimension so that the hiddenSize dimension comes first. - - // Activations matrix is 2D (sum(divUpMul(M[bi], tileM) for bi in B), K). - std::vector shape = {static_cast(hiddenSize), static_cast(numTokens)}; - if (useTmaOobOpt /* also implies input/output activation */) + auto shape = std::vector{static_cast(hiddenSize), static_cast(numTokens)}; + // If the matrix is a weights matrix, we use 3D logical shape for it (B, M, K) or (B, N, K). + // Ativations matrix is 2D (sum(divUpMul(M[bi], tileM) for bi in B), K). + if (isWeights) { - // If TMA OOB optimization is used, we use 3D logical shape (M, tileM, K) or (N, tileN, K). - // The outer dimension is extended to make room for the possible counterbalance positive - // offset from the middle "bound" dimension. The counterbalance should be no more than - // ctaTileNumTokens. - shape = {static_cast(hiddenSize), static_cast(ctaTileNumTokens), - static_cast(numTokens + ctaTileNumTokens)}; - } - else if (isWeights) - { - // If the matrix is a weights matrix, we use 3D logical shape (B, M, K) or (B, N, K). - shape = {static_cast(hiddenSize), static_cast(numTokens), - static_cast(options.mNumBatches)}; + shape.push_back(static_cast(options.mNumBatches)); } // Assemble the stride (strideTokens, 1). // Swap the first two dimension as mentioned before. - std::vector stride = {1, static_cast(hiddenSize)}; - if (useTmaOobOpt) + auto stride = std::vector{1, static_cast(hiddenSize)}; + if (isWeights) { - stride = {1, static_cast(hiddenSize), static_cast(hiddenSize)}; - } - else if (isWeights) - { - stride = { - 1, static_cast(hiddenSize), static_cast(hiddenSize) * static_cast(numTokens)}; + stride.push_back(static_cast(hiddenSize * numTokens)); } // Assemble the box shape std::vector tileShape = {tileHiddenSize, tileNumTokens}; - // Alternate layouts (MajorMn and BlockMajorK) do not apply to matrixC + // Alternate layouts do not apply to matrixC if (matrixType != MatrixType::MatrixC) { gemm::MatrixLayout layout = (matrixType == MatrixType::MatrixA) ? options.mLayoutA : options.mLayoutB; @@ -348,8 +296,8 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc for (int b = 0; b < options.mNumBatches; b++) { - int mM = batchM ? options.mBatchedM[b] : options.mM; - int mN = batchM ? options.mN : options.mBatchedN[b]; + int mM = batchM ? options.mBatchedM[b] : options.mN; + int mN = batchM ? options.mM : options.mBatchedN[b]; // Skip Tma descriptor creation if expert isn't used if (mM == 0 || mN == 0) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h index 2dfb0a1894..3a3a3ec6ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h @@ -1,3 +1,4 @@ + /* * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 @@ -18,7 +19,6 @@ namespace batchedGemm { - // This is device code struct KernelParams @@ -32,55 +32,6 @@ struct KernelParams // Maximum number of CTAs static constexpr int MaxNumCtas = 2048; - // NOTE: TMA out-of-bounds optimization for MoE padded tokens: - // - // Originally the padded tokens is a 2D tensor [hiddenDim, ctaGridDimY * tileN] with stride [1, - // hiddenDim] and box size [tileM, tileN] at pointer p. We waste bandwidth bytes since we only - // want to load [0, batchEnd) out of the [0, tileN) box size: batchEnd is a runtime variable while - // box size needs to be fixed at compile time. - // - // To deal with this, we reshape the tensor to 3D: [hiddenDim, tileN, ctaGridDimY * tileN] with - // stride [1, hiddenDim, hiddenDim] and box size [tileM, tileN, 1]. For the original 2D - // tensor, - // - // Offset Coords [ : , ctaIdxY * tileN ], - // Box Sizes [ : , tileN ], - // Coords Range [ : , ctaIdxY * tileN : ctaIdxY * tileN + tileN], - // - // while we only want load the range [ctaIdxY * tileN, ctaIdxY * tileN + batchEnd), 1 <= batchEnd - // <= tileN - // - // For the reshaped 3D tensor, - // - // Offset Coords [ : , tileN - batchEnd , - // ctaIdxY * tileN + batchEnd ], - // Box Sizes [ : , tileN , - // 1 ], - // Coords Range [ : , tileN - batchEnd : min(tileN, 2 * tileN - batchEnd), - // ctaIdxY * tileN + batchEnd : ctaIdx * tileN + batchEnd + 1], - // - // while min(tileN, 2 * tileN - batchEnd) always evaluates to tileN. The unwanted tokens are - // essentially filtered out by utilizing the OOB feature of TMA. Since the 2nd and 3rd dimension - // has the same stride, we end up loading the following (adding the left and right end of the 2nd - // and 3rd dimension ranges): - // - // Effective 2D Coords Range - // [ : , tileN + ctaIdxY * tileN : tileN + ctaIdxY * tileN + batchEnd], - // - // This is exactly the same as the original range except for the offset tileN, thus we also need - // to offset the pointer in the opposite direction: - // - // Ptr (p) -> Ptr (p - tileN * hiddenDim) - // - // Due to the restrictions of TMA unit, the above operations requires the TMA descriptor and the - // underlying buffer be constructed differently: - // - Requires valid buffer at (p - tileN * hidden) - needs prepending `tileN` tokens. - // - TMA outermost dimension must be extended by `tileN` or loads will OOB in the rightmost side. - // The latter is because when batchEnd == tileN, the offset coords in the 3rd dimension becomes - // ctaIdxY * tileN + tileN. When ctaIdxY = ctaGridDimY - 1, it becomes ((ctaGridDimY - 1) * tileN - // + tileN = ctaGridDimY * tileN which is equal to the 3rd dimension size and will be filtered - // out. That's why we need to extend the tensor size by tileN. - // // TMA descriptor for A. // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from // makeTmaShapeStrideAbc. diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h index f6c8b18092..616383f6a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h @@ -20,7 +20,6 @@ #include "trtllm/gen/CommonUtils.h" #include "trtllm/gen/DtypeDecl.h" #include -#include namespace batchedGemm { @@ -78,38 +77,6 @@ public: } // Returns the offset of the ith chunk - int32_t getChunkOffsetByName(std::string const& name) const - { - for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii) - { - if (mSmemChunkNames[ii] == name) - { - return getChunkOffset(ii); - } - } - throw std::runtime_error("Name not found: " + name); - } - - // Returns the first chunk reuse flag given chunk name. - int getFirstChunkReuseFlagByName(std::string const& name) const - { - for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii) - { - if (mSmemChunkNames[ii] == name) - { - return getFirstChunkReuseFlag(ii); - } - } - throw std::runtime_error("Name not found: " + name); - } - - // Function to calculate the total size of the SMEM array - int32_t getTotalSize() const - { - return getOffsetBeforeChunk(static_cast(mNumBytesAndAlignmentPerSmemChunk.size())); - } - -private: int32_t getChunkOffset(int32_t ii) const { if (mFirstChunkReuse[ii]) @@ -124,6 +91,12 @@ private: return getSizePaddedToAlignment(offset, mNumBytesAndAlignmentPerSmemChunk[ii].second); } + // Function to calculate the total size of the SMEM array + int32_t getTotalSize() const + { + return getOffsetBeforeChunk(static_cast(mNumBytesAndAlignmentPerSmemChunk.size())); + } + // Returns the first chunk reuse flag for the ith chunk. int getFirstChunkReuseFlag(int32_t ii) const { @@ -166,7 +139,9 @@ int getNumSmemBitsPerElt(tg::Dtype dtype, tg::MmaKind mmaKind) { if (mmaKind == tg::MmaKind::Auto) { - throw std::runtime_error("mmaKind != tg::MmaKind::Auto"); + std::cout << "mmaKind != tg::MmaKind::Auto" << std::endl; + assert(false); + return -1; } if (mmaKind == tg::MmaKind::MxFp8Fp6Fp4) { @@ -566,14 +541,14 @@ inline int32_t getTmemBufferSize(KernelTraits traits) inline int32_t getSmemOffsetLoadA(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadA"); + return traits.mSmemAllocatorHelper.getChunkOffset(0); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetLoadB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadB"); + return traits.mSmemAllocatorHelper.getChunkOffset(1); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -587,63 +562,64 @@ inline int32_t getSmemOffsetLoadAb(KernelTraits traits) inline int32_t getSmemOffsetLoadShuffleB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBShuffle"); + return traits.mSmemAllocatorHelper.getChunkOffset(2); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetGmemC(KernelTraits traits, int resIdx = 0) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemGmemC" + std::to_string(resIdx)); + return traits.mSmemAllocatorHelper.getChunkOffset(3 + resIdx); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetRowMax(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemRowMax"); + return traits.mSmemAllocatorHelper.getChunkOffset(5); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetSliceK(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemSliceK"); + return traits.mSmemAllocatorHelper.getChunkOffset(6); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetPerTokenSf(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemPerTokenSf"); + return traits.mSmemAllocatorHelper.getChunkOffset(7); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetBias(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBias"); + return traits.mSmemAllocatorHelper.getChunkOffset(8); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetBlockAmax(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBlockAmax"); + return traits.mSmemAllocatorHelper.getChunkOffset(9); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetConstSfBuf(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemConstSfBuf"); + return traits.mSmemAllocatorHelper.getChunkOffset(10); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0) { - return traits.mSmemAllocatorHelper.getFirstChunkReuseFlagByName("smemGmemC" + std::to_string(resIdx)); + // Be conscious that the index (3 + resIdx) should match the index in getSmemOffsetGmemC(). + return traits.mSmemAllocatorHelper.getFirstChunkReuseFlag(3 + resIdx); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -654,28 +630,28 @@ inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0) inline int32_t getTmemOffsetD(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemD"); + return traits.mTmemAllocatorHelper.getChunkOffset(0); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetA(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemA"); + return traits.mTmemAllocatorHelper.getChunkOffset(1); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetSfA(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfA"); + return traits.mTmemAllocatorHelper.getChunkOffset(2); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetSfB(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfB"); + return traits.mTmemAllocatorHelper.getChunkOffset(3); } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h index f15f246f81..a5cb3ab953 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h @@ -181,8 +181,6 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st if (result != CUDA_SUCCESS) { - char const* errorString; - cuGetErrorString(result, &errorString); std::stringstream ss; ss << "Error: Failed to initialize the TMA descriptor " << result << std::endl; @@ -285,10 +283,8 @@ inline CUtensorMap buildSfTmaDescriptor(tg::Dtype dtype, std::vector c if (result != CUDA_SUCCESS) { - char const* errorString; - cuGetErrorString(result, &errorString); std::stringstream ss; - ss << "Error: Failed to initialize the TMA descriptor for SF " << errorString << std::endl; + ss << "Error: Failed to initialize the TMA descriptor for SF " << result << std::endl; ss << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json index d54e8a3861..0394408dea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json @@ -12,6 +12,7 @@ "epilogueTileM": 128, "epilogueTileN": 8, "numStages": 4, + "numStagesMma": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -30,7 +31,8 @@ "sfLayoutC": "8x4", "batch": "N", "numExperts": 128, - "useCudaGraph": true + "useCudaGraph": true, + "clampLimit": 2 }, "BatchedGemmPerTensorScalingFp8LowLatency": { "dtypeA": "e4m3", @@ -44,6 +46,7 @@ "epilogueTileM": 128, "epilogueTileN": 8, "numStages": 3, + "numStagesMma": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -60,7 +63,8 @@ "gridWaitForPrimaryB": true, "batch": "N", "numExperts": 128, - "useCudaGraph": true + "useCudaGraph": true, + "clampLimit": 2 }, "BatchedGemmDeepSeekFp8LowLatency": { "dtypeA": "e4m3", @@ -94,7 +98,123 @@ "numStagesMma": 4, "batch": "N", "numExperts": 128, - "useCudaGraph": true + "useCudaGraph": true, + "clampLimit": 2 + }, + "BatchedGemmMxE2m1E4m3LowLatency": { + "dtypeA": "mxe2m1", + "dtypeB": "e4m3", + "dtypeC": "e4m3", + "dtypeMmaB": "mxe4m3", + "mmaM": 128, + "mmaN": 8, + "mmaK": 32, + "tileM": 128, + "tileN": 8, + "tileK": 512, + "epilogueTileM": 128, + "epilogueTileN": 8, + "numStages": 3, + "numStagesMma": 1, + "numSlicesForSplitK": 1, + "useTwoTmaLoadWarps": true, + "clusterDimX": 1, + "clusterDimY": 1, + "clusterDimZ": 1, + "sliceK": false, + "transposeMmaOutput": true, + "useShuffledMatrixA": true, + "useDeepSeekFp8": false, + "useTmaStore": true, + "useCustomMmaSchedule": true, + "gridTriggerSecondaryB": true, + "gridWaitForPrimaryA": false, + "gridWaitForPrimaryB": true, + "sfLayoutB": "8x4", + "sfLayoutC": "8x4", + "batch": "N", + "numExperts": 128, + "useCudaGraph": true, + "biasType": "m", + "act": "swiglu", + "clampLimit": 2 + }, + "BatchedGemmMxE2m1MxE4m3LowLatency": { + "dtypeA": "mxe2m1", + "dtypeB": "mxe4m3", + "dtypeC": "mxe4m3", + "mmaM": 128, + "mmaN": 8, + "mmaK": 32, + "tileM": 128, + "tileN": 8, + "tileK": 512, + "epilogueTileM": 128, + "epilogueTileN": 8, + "numStages": 3, + "numStagesMma": 1, + "numSlicesForSplitK": 1, + "useTwoTmaLoadWarps": true, + "clusterDimX": 1, + "clusterDimY": 1, + "clusterDimZ": 1, + "sliceK": false, + "transposeMmaOutput": true, + "useShuffledMatrixA": true, + "useDeepSeekFp8": false, + "useTmaStore": true, + "useCustomMmaSchedule": true, + "gridTriggerSecondaryB": true, + "gridWaitForPrimaryA": false, + "gridWaitForPrimaryB": true, + "sfLayoutB": "8x4", + "sfLayoutC": "8x4", + "batch": "N", + "numExperts": 128, + "useCudaGraph": true, + "biasType": "m", + "act": "swiglu", + "clampLimit": 2 + }, + "BatchedGemmMxE2m1Bf16LowLatency": { + "dtypeA": "mxe2m1", + "dtypeB": "bf16", + "dtypeC": "bf16", + "dtypeMmaA": "bf16", + "dtypeMmaB": "bf16", + "mmaM": 128, + "mmaN": 8, + "mmaK": 16, + "tileM": 128, + "tileN": 8, + "tileK": 256, + "epilogueTileM": 128, + "epilogueTileN": 8, + "numStages": 3, + "numStagesMma": 1, + "numSlicesForSplitK": 1, + "useTwoTmaLoadWarps": true, + "clusterDimX": 1, + "clusterDimY": 1, + "clusterDimZ": 1, + "sliceK": false, + "transposeMmaOutput": true, + "useShuffledMatrixA": true, + "useDeepSeekFp8": false, + "useTmaStore": true, + "useCustomMmaSchedule": true, + "gridTriggerSecondaryB": true, + "gridWaitForPrimaryA": false, + "gridWaitForPrimaryB": true, + "sfLayoutB": "8x4", + "sfLayoutC": "8x4", + "batch": "N", + "numExperts": 128, + "useCudaGraph": true, + "biasType": "m", + "act": "swiglu", + "patchF2fp": true, + "clampLimit": 2 } }, "configs": [ @@ -221,6 +341,7 @@ "_template": "BatchedGemmPerTensorScalingFp8LowLatency", "routeAct": true, "fusedAct": true, + "usePerTokenSfB": true, "useUnrollLoop2xForMma": [true, false], "dtypeC": "e4m3", "numTokens": 2, @@ -243,6 +364,150 @@ ["static", 1], ["persistent", 2] ] + }, + { + "_comment": "MxFp4xFp8_FC1", + "_template": "BatchedGemmMxE2m1E4m3LowLatency", + "routeAct": "ldgsts", + "fusedAct": true, + "sfLayoutB": "linear", + "useUnrollLoop2xForMma": [true, false], + "numTokens": 2, + "numExperts": 2, + "mmaN,tileN,epilogueTileN,tileK,numStages": [ + [8, 8, 8, 512, 3], + [8, 8, 8, 256, 5], + [16, 16, 16, 256, 5], + [32, 32, 32, 256, 5], + [64, 64, 64, 256, 4] + ], + "tileScheduler,numStagesMma": [ + ["static", 1], + ["persistent", 2] + ] + }, + { + "_comment": "MxFp4xFp8_FC2", + "_template": "BatchedGemmMxE2m1E4m3LowLatency", + "routeAct": false, + "fusedAct": false, + "useUnrollLoop2xForMma": [true, false], + "dtypeC": "bf16", + "numTokens": 2, + "numExperts": 2, + "mmaN,tileN,epilogueTileN,tileK,numStages": [ + [8, 8, 8, 512, 3], + [8, 8, 8, 256, 5], + [16, 16, 16, 256, 5], + [32, 32, 32, 256, 5], + [64, 64, 64, 256, 4] + ], + "tileScheduler,numStagesMma": [ + ["static", 1], + ["persistent", 2] + ] + }, + { + "_comment": "MxFp4xMxFp8_FC1", + "_template": "BatchedGemmMxE2m1MxE4m3LowLatency", + "routeAct": "ldgsts", + "fusedAct": true, + "sfLayoutB": "linear", + "useUnrollLoop2xForMma": [true, false], + "numTokens": 2, + "numExperts": 2, + "mmaN,tileN,epilogueTileN,tileK,numSlicesForSplitK,clusterDimZ,numStages": [ + [8, 8, 8, 512, 1, 1, 3], + [8, 8, 8, 512, 2, 2, 3], + [8, 8, 8, 256, 1, 1, 4], + [8, 8, 8, 256, 1, 1, 5], + [8, 8, 8, 256, 1, 1, 6], + [8, 8, 8, 256, 2, 2, 4], + [8, 8, 8, 256, 2, 2, 5], + [8, 8, 8, 256, 2, 2, 6], + [16, 16, 16, 256, 1, 1, 3], + [16, 16, 16, 256, 1, 1, 4], + [32, 32, 32, 256, 1, 1, 3], + [32, 32, 32, 256, 1, 1, 4], + [64, 64, 64, 256, 1, 1, 3], + [64, 64, 64, 256, 1, 1, 4] + ], + "tileScheduler,numStagesMma": [ + ["static", 1], + ["persistent", 2] + ] + }, + { + "_comment": "MxFp4xMxFp8_FC2", + "_template": "BatchedGemmMxE2m1MxE4m3LowLatency", + "routeAct": false, + "fusedAct": false, + "useUnrollLoop2xForMma": [true, false], + "dtypeC": "bf16", + "numTokens": 2, + "numExperts": 2, + "mmaN,tileN,epilogueTileN,tileK,numSlicesForSplitK,clusterDimZ,numStages": [ + [8, 8, 8, 512, 1, 1, 3], + [8, 8, 8, 512, 2, 2, 3], + [8, 8, 8, 256, 1, 1, 4], + [8, 8, 8, 256, 1, 1, 5], + [8, 8, 8, 256, 1, 1, 6], + [8, 8, 8, 256, 2, 2, 4], + [8, 8, 8, 256, 2, 2, 5], + [8, 8, 8, 256, 2, 2, 6], + [16, 16, 16, 256, 1, 1, 3], + [16, 16, 16, 256, 1, 1, 4], + [32, 32, 32, 256, 1, 1, 3], + [32, 32, 32, 256, 1, 1, 4], + [64, 64, 64, 256, 1, 1, 3], + [64, 64, 64, 256, 1, 1, 4] + ], + "tileScheduler,numStagesMma": [ + ["static", 1], + ["persistent", 2] + ] + }, + { + "_comment": "MxFp4xBf16_FC1", + "_template": "BatchedGemmMxE2m1Bf16LowLatency", + "routeAct": "ldgsts", + "fusedAct": true, + "sfLayoutB": "linear", + "useUnrollLoop2xForMma": [true, false], + "dtypeC": "bf16", + "numTokens": 2, + "numExperts": 2, + "tileK": 256, + "mmaN,tileN,epilogueTileN,numStages": [ + [8, 8, 8, 3], + [16, 16, 16, 3], + [32, 32, 32, 3], + [64, 64, 64, 3] + ], + "tileScheduler,numStagesMma": [ + ["static", 1], + ["persistent", 2] + ] + }, + { + "_comment": "MxFp4xBf16_FC2", + "_template": "BatchedGemmMxE2m1Bf16LowLatency", + "routeAct": false, + "fusedAct": false, + "useUnrollLoop2xForMma": [true, false], + "dtypeC": "bf16", + "numTokens": 2, + "numExperts": 2, + "mmaN,tileN,epilogueTileN,numStages": [ + [8, 8, 8, 3], + [16, 16, 16, 3], + [32, 32, 32, 3], + [64, 64, 64, 3] + ], + "tileScheduler,numStagesMma": [ + ["static", 1], + ["persistent", 2] + ] } ] } diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..4f5fc68a82 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eba4edca5eaa1fc6b654c4b720339cd536a02723cc798fc17cb31314a1681633 +size 701588 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..2d413cf109 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20ad15a9f6be1c021baf23f4f24154c22a05ce90d26c631dc21bf53e0a489174 +size 581311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index ea60080697..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:815661113030c8580cc963c443afc3c82fff7c8f8dd8a0ed98f95a08a91f619a -size 684616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 7398382e17..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:75593264df3d37f7d23664c02e426196a5f7ee1cc6de76db84895fca4e706c97 -size 562811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..88b1a44ef3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1de230b7b4bcfec7b5100f8ddbe05f3789577a38767c314cc22554a0b4463275 +size 722952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..17fb2c8ac0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bcb2a1f2c500a4755940bc5803c9e7ac0a3987d671ff705599849625339dd0a +size 603467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 6ab10b4e46..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:11afdd6753faeffe1446abed54c61ccf0e190ff886fc0270d055a4dbce9a9298 -size 705390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 1be82f2378..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7e4ecf7abe90d1a7f46a1ff3005cad8ebe0de8e7cf89862cf993f228a98ea00d -size 582747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..8fee1640f8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58a2bd3f71c60360ac8aabb9a70e96f69e7ce9cb8de89c36fc10786cc47f5eb7 +size 684024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..9e086ef147 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5cc810d34a7ad8dfdf60f3708158afd3a1743528d6788dca91868bc86c66845 +size 567153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index ad367f3b91..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e921f66e6adc5292b99df5cfbae4a9cbae6182c8e99bbc83ea30bd1ca8ed8f55 -size 667892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index fd64e17181..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3b6994776b67405b359d471fa5c873efa4714dd71397ddfd82a52d59cbf20a9a -size 550035 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..6fd76dc2f8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0252fbcd6f38bcd3e39e07cdabcf7776a4089745290caeac27b6d446ba1cb46e +size 717132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..205923904f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5b588e415053600a90d38211bcb5a969471b0e5e6c3437d78254d236129a270 +size 593303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 0a76b6bb5c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec5cf07ec8b7a4405c305fb82f9eb7179a4a43ab14a2eacfadc35072b317cfd7 -size 700704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 5d763ed185..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6188f80d9ca7f95ea404a72de95bb8a38cace5dd8a8e76527fd83cf16aaff87d -size 575543 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..1320420c1d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4be5d8b93abcd165428c9770e68286224d01f6624941ed534fb66bcc17344ab +size 743964 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..145425781a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0482d8975738bd4e614dbb86745c5d49445d6a5c460d2245d3f17ce7bc992a1a +size 585703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 49e14cf3f4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b725a2ec74293ef928ade1206ebf2e3726f5980bc943f157372a414834d756fd -size 725020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 9f36031dff..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9b2bd2855cc99dd074f24435265bfc32d0114a2d9e02ff98565c7881095674dc -size 566363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..8c8a035a95 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd31bc4f601149aba47f5e78fe197db66a83afde56071e8281818e1de727daa9 +size 765280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..70a68fc8af --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a73ddc42c26585294145e543dedcf4140ff538044e5113aefed6dd5537cdaa2 +size 607019 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 4e3f4f0a99..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2be435473939c1f81601b61b372a37982b0aa0f107cd4778c201d160c4f8e43c -size 745004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index c2e9f855cf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ba80fab864884c31fdca7963a86df70f384e06e5ef76e904971543af60a05c06 -size 586347 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ec7f0253d3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01cf2fbe35943350cbbc9930f632af76b9f2fc1de61dda1f55bd97c9cb15792e +size 687034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..1c17e9a855 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7e50ffcd72e58781a11e39f3d3de530214aaa25b9d8b78bc5326f5a17bec622 +size 570163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index e9f6b4a0e4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cdf20f912daa7f6e4580ead6b14f66c3aa0d70d536dfdb509ab06574f8dedcc2 -size 670656 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 10a9a79555..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:04085e7681246fbf85f61871704ceb68ec39dc3a2ed9a4c3b9855b8da6d6a0f6 -size 552255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..946a22fc74 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb2d60b386a7d82f5076a78095ac9ca7a1431c082c0f11915ef602b72e16d2b8 +size 719944 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..66ca1d88cb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d556fada7372596fd09b3e8a06f21c67b891f35890bef733bd278d4f746e578e +size 596263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index aeaeba607a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:46d47463f5c2c2711468fde51b407b9e31fc7458cd98c2f2d79181037554591e -size 702972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index c8d4342940..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:58fb73ed28d0c7c1e0705fcade19faa88e7b310c4b97d0004c84ed52cc275cd1 -size 577763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..63925a8806 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4356863633b801a23e960dc4e9e424a69c7b1897a7bc97579e23504235d3e373 +size 718360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..2f77b14eab --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40476bd068ca9bbc4f7217fa9fc04b6b7fdecdb45eb1e9f62dcd566e55c7d07a +size 596605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 0df83c1ed0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fed2ac3d4effc40881584f93f4ecf938d0081131cb7b5fa26519be7101d8b0a4 -size 700600 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 1159769f01..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:516bdac65a439b1ab5615912d573172c38792ab898e68f1b972253588767c398 -size 577315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ca97fb7eb2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f89b00b57cafb59f1b912d03ffb8037fba3cc614274d600a758f1597a4d0ab19 +size 737950 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..291e8d695e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16532048133f5f840f3823ecc343d3d5be65164341ae4f2900dba6901f50c588 +size 618760 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index cd52416b52..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:31b8d4b7836e084a9b3f5e8a76d8257f33840162d26b385db5bfcccfd36333fa -size 719648 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 6b3ff0aedb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:10607e6a0e33ca77881111bf04f7e190bab576f1987ecaf38298b96682a3f51d -size 598089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..209e291246 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da385287279c9383155405cd2344705c4607b7742466e8a545fef694bb75cb49 +size 696556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..32dac02b51 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3273045650b7230c83758d9ccfd6cc06b52cbdd91624e50538481d447e48c509 +size 566955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index ad9f8b93a8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:28e437809e9158ec82047a8ff72248fe641c735eb6ebb50984b76fda16df32b2 -size 680966 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 25893f8f2c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ea3a2e628b52e2ab71dee5405da588b8735f5cd2d250dd2b106621c603bc4183 -size 549047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..c4f238a9fc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6f459a3ac169228f685c60db132055f25ff9ed8c3a8455be315858eb97d223a +size 729564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..42c070b6ca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24e5cb1112172dfa0522f99de2d1cef6bfa50304b2d3c32fbde39bca36e92567 +size 593847 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 5b31abadc4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7cebb410d52fda0fef353941e34f1e9b09e152b6d801885d43f39b05ab7feecb -size 713284 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index be9e44d083..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a01cadc451bde20a1dc1b2905f216468009401f5c230d8e3e172eb7c0e19a73e -size 574557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..32c5c67389 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6d6ac9bea4823aa5e7f7efa696533e6d52405e5cfeba7732051e30789813eed +size 697586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..9209ddf7bc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d401d5ecb5ea7fe0a689c516a195e4eb325a1b2c6d38e2f0c49f712c923efce +size 576817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 654bdb4b14..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8025a48697e5c22bb2cd0d21023d7a554ef64cb8658f24f171c6c249b0104941 -size 680714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 93d0c09eaa..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f2c747c157bc41de3c7e85f31cb39752451d5809476af277602d64c0b5a6cb27 -size 558317 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..760c111f8f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f945903cacb99529e668335deaeb0cc9a72f6261e0c2f1f87e037c31d0dbf29a +size 719000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..01f493031f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9ca86c5b4528521ece2e2694187d67336e4b92a315d8fa3db9ed3fb9e4ea3aa +size 599761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 5218751f5f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:22751baf77b6f7d97d3914458973a6ca03dfe01e7f9fd84e419903b133d82b16 -size 700746 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 7a4b66d1bd..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3aa5c83701db0e0adf0f1727f454867a29c941587f1d6be2fa496143b5e768a1 -size 579089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..1cd762d60d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b60a53ca988b19c6881800d3a8c03e3464e02644c9c8552bb8c9155defb5e3e4 +size 679480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..919940219e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:000feee3db5eba2d629e2c31599fa66807943f60e3205cd69c5c30e418a8a7ee +size 563497 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..fcefda4d8f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4531ad2f99ffcb3326fe8c3312ac93f47dc3f9cc564ee8bd8cabc80766d79828 +size 560519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index addbcd1565..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:70f3e4c712f6c1e7a887771473daea1f4ebdb3febb4ae8e0a8e8045ee867712b -size 663694 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 735fb2326c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6d4e15ebd2d638df47afc4fcc58ff56ba8032d7352475cdae04c20195795557 -size 546377 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 4c548b23a5..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a4899f491141f32b13e27af6af56d729abaf5e0454cbd6f1a6a2f409ccb237d -size 542611 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..7a7044617e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bf8748bf1007349c5fc8cf062ba90669680ec8fe3681ecb6174cb1da0548a7f +size 713030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..931c2f8534 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0ad13b702274ebacb48c40948662d38f6c184b61b868f3c6ed062862c712b04 +size 589597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..8decd6ef88 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4310d5b81d090acdfa8481681b3310a6417b0b72e67ca6b7605a5766bd691296 +size 586619 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index e924aa915b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d8185e69e406c9e2cb2533f6c4f17ceba7c938642439634f265acad34cc56af5 -size 696010 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 6e1be3ef3b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7f65dfb888be44aa2213e8a8a9dc4ff984e4224135df2e94241bb52ce60c19df -size 571097 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 35bb5532e6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:55285b3eaf9712c22ebc203ec690148dfb971f477710c0fc8139070573f138d8 -size 567331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..80651d9ca6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ebe7bcc1ee67e5c47ccc7c3fbf3f9fb11c9ee3df907cff85bcf9efdb06e473c +size 626970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ca1e2df502 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e4c7081a9e7f297a9406783919d035cad4e542e1ffdf261957a2a1c5c1807ae +size 505905 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index c7e99a3a1d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:85420fddbd50cf79cc03a9f7b42957e853967e7995c768b874c38c23c317cd93 -size 621100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index cb07bf204e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:55ef9504c71a596c687518e99a9de172baa0d407acaa438f810cc66b4ab03353 -size 497567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..4ae14cc32f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef80a87cb0f3df7e0663a811eeb2acb5471d886e9b7c57e1d1700f39fefafdd1 +size 654748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..d87858b561 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4af837c7a8837637f32715f5ec16bdf37d88edc5722d5e73464a215d3815512 +size 524459 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 673479f0ee..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d7d29557f9f4d2c5b259f81c0f26f9df5edc25f4029cbb0b4dbecf3e90e71b46 -size 648286 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 74c02a10bc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d2486fee1a77912dbb3ebba6d4c4f6418c4c1a90013f0f2a323ece5e844f2753 -size 516467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..087293c32b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d105ac221b0ab6dd54bb4d4a33e2816514a1d3c1f5de2bbfc6f33cf8e292b093 +size 634518 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..d831b0893a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a76c94d3c24f8a511c2bdbacf33e8b25d8b933cd2992764e20227cc4aa21ee2 +size 515179 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 60a4705f69..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12e36aef6743d6ddd96ddb08418626fe5b3e4b83f663957386dec718d1325f8f -size 629388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 6099b2ccd9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:36846461c9cee973c8e43123da81b8ce68512821cdb013bd3ff7e2b47cc4a736 -size 507681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..67ecb2e60a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d67d3bd621138d2201abac84f8ffb2bf1903b674b23e254c4efca4a5e5825c7a +size 661508 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f35a8659b2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8279e11a9fef0ce15ac4cfdacfdf33796dbf85a14b5c76debdce5dc81b7e817 +size 534523 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index dd379b98e3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ba884722cb487cfa74838af9106919249ae4d0069cd7a76593f91644558f18d8 -size 654896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 62a08c6f2b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9fc47e97fff712a63a4837aaa4dfe2edbd4f8b3b6d0621d6478d9862b51156cb -size 523915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..23c9a0aa89 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59736e223d17e04ce5a3b11056f1cd6dcce0795705e778c4a40188129644ee4e +size 672406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..c31c7e594b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4799e5b72cb1fc46ed6b3aeb38db6a0d3c682891c70e87b91095537f7e3f6e77 +size 551143 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 6ff1e45633..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:15b965067f5d7bc01c722dd20399eb842a21e313040a47d5745bf6b305635ee6 -size 667276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 380089acb7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2d6eb31c54ac57087ac4d1223fe88c62084b376ba9dfcbfc35032be1e179a2cd -size 543595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..78f0e6c2d3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:789e8dbc977702753be3db385d1516e23c1a5424c9579032e41a93e5c19a3ff5 +size 700284 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..5b850c67f1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b6e7568ed077c12a3c4be7b9732b2b7595663005743824b6f80909a1cb6a4d1 +size 570487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 18e1bbfc02..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ce665b7b39d4a3207eda877552a07913cb8e4cb7e53a9ed8833dcb550b45171f -size 699790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 861a696dc7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a024d6f97d8ca153bf21e7f0a8d510003a944fe2ef7c6c75c5645997acfe4e97 -size 560767 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..1a4d136007 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d5b0a7a0efe303a49938f4ff502a097f79ac75935e2f152704d96fe26d573e6 +size 509235 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..4ecd68e269 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25a19c7fad0a8cc15939ac8b63ff3d9a05fb0940d0b3a6548bcb2feef79a4b31 +size 620748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..b814aa69e1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d61944ff22f7dd643a192e7a9ad320ab5e0c7fe79329682bc58691033cb8c7ff +size 535401 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 343c5c8761..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cf7318cc054638cd294e5c66c7e58921bc2832f8fd862108dee360e9f9023ca0 -size 500899 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index a74bc8075e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:14fec35af5f02b0e83c0fa75432ada6f733043ca2590084efa57d035dbb9ac0d -size 614925 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 4b63464bf4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:36055fc1a963c09ebc4164c4c2190f408f273183898d85e708c2c0b1c559c22f -size 527063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..a7b4dea8ed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a93310393afe813c9dcf06d87968343e8fd1428daae69e73e2b7d70aa14140a7 +size 527839 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..27f1267cd6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6336e5ef6dd633938f215fbe983d15a3191438011457b54c05bf68311eaeb9 +size 646652 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..31da04bbd2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb670937e92ffad477d35995e010fffde6d3e8932693580a27dd3f146c584bf7 +size 553953 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index f03fe89a4e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66c86d51a8b981965e5a82df05052b0f56ebdb2f9709393f7305237837e30b22 -size 518909 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 042e648572..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:912c750c31ac443dc14b74bb80c61314c3535d7edcfe47a4426d55245c6adb16 -size 640830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 2e7b8ffbba..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ccca9bf30a12e0e7fe0f9d4f075ecca308b22bc3c52b633b98a9db5b2c87bcc4 -size 545221 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..446bf71f3b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf19af4ae1d2f1a573000d84e8d8ad160765af49b572a28555865a924c3fa650 +size 421811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..da7d90e5c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d17f90abba6e17234ae801179bd8fb4edc8ea164825f4336e62e3636a1f67e0 +size 357677 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..b45c580dc3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65281be492f09d45c994e8f1fe08c0cf9ebd085a4604e41a8836afbd10dd5a2e +size 353615 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 07ef9af81e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:18997fb7464d75fbddd5b974ebce6e947296216ec315098c67863fb18662c2f1 -size 419789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index f44bc59d7d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3b71db6a57bdbe221e12aef06842bdc966d9ccedc9e068c49aa3c4b3c74c6d4c -size 352153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 24a68e306b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a2b59ffcbf4c1a6b4825182d589835b97318ab99fc2b76082154a0dfbe5208d4 -size 348089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..0f3459c12a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b975ac0013dbd7545c991f9bd83823e58e580d4aeb1e4f06c3733cd12fdd8ff +size 443521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..871c9d9a0c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7050fc745946c7bdfa5a00c6240a53eb57db99ebc9ae4c131256e7987c91c05 +size 381805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..393e63fde6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed847484ea72cd2d6e2d1a36801fc08080ca65abdb68e5068d2637d35d1b824e +size 377743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 4bc15d718a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bce226338e7a2675b25a2a899020d1b796e7c0cd325acc92d6ea510ad932ab3b -size 441203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 7f6afaa068..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d1c0555fa7853afe618da47f3c85a6a30969765c3dc15678f5a917e564617ccd -size 374849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 0528275bb6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:445be6bb2d548ccbad24cb54f9d7bf6bd623330c1097f2b722ac74a5f72f2d3b -size 371625 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..557a285e30 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f068e6ad63a6ea2109e7e05c4c36cd20b79ac67ee3b03ef3832d7879a7fde21 +size 440379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..e8c781d306 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d055f07729c7bd18cb2d5fbc25a3dd049b450238cd59848b5be7db08c389796 +size 453233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..96950c6e4e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc9873b7adc517523652fa1f4cdb998f9404183ae77d588fd9525c5008f2253f +size 364553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..99854989dd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d5a95d9332c3b9c762280c29bf1e278027c5b942703a2132149616e832bb732 +size 368677 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..1bfd98e644 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6d88a72ef2277944759af36b19dc5271135c0f3fab8df75789760b07772d078 +size 454393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..15b9c3d1b4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2054e688f44e03ba8ebfda66e1309a0f1aa8cc8f9830a7bf5bb75356493377fd +size 468037 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..fa2060badd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a07505bfa15eb1aae4e375b9b11f893637661fbee0de32723fcff56862c2ce51 +size 377779 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..fc8055564a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2add633f814a48e0b0d09ccaeb7c662efce016e4fb1140e7f13755ab6965b254 +size 382691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..93ccd38c4e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66894f7268b6a1654a6674ebfd396faf69eaa59e8fc464ca30fb78c5a01e07b8 +size 444325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..7929c64830 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87888466934ccb434ba985624bd08b1b057b2c28ffe4e803a88aa3685687c6f2 +size 478541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..d6c96a9993 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8e08ad75bd8c8c8137e73a61ba5370d70c57f87d2fc1fe1768d28b6e5a0716 +size 368501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..5e1439d6d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca9357f05827020608dcf3b6ffc7071e6b50503e548748a87021f0e1558ec578 +size 389199 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..54d9952d37 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c4c3490d3744788c6028a0a4dc8563042027a4a94c3c5e8a944df1ec4cbde2c +size 458341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..33cb05e734 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d51b50263031f8d27635c253c74fadf4b87a5784c138d277dc42189441bea84 +size 492557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..356543d472 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec5d0e19251c3dc75be7d90c43b1f9f12dd840268bb9426cb24d8d4fbf75eb23 +size 381725 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..36e251f061 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a052be40a04029696dd27332c2829508d3fc0a0f4167aae324aaa7e25fe6f57c +size 403213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..9827ffeeb4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6606834f0b0bc7a8e65358a2b1fb10e4b3aa465b191eabb52704d91865ce86f0 +size 452219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..d754acd378 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e68264dc7bc8ae453290039c1c41a4dae1c91d4df9bd9ded2ab2e24797d9de68 +size 526519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..555ae7d385 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:962d0c465373b435425f8c9f4286f4b1370154b1d0f264b34181b1a9232dc45a +size 376393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..785fe5395a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:106a463a15e275fe0bff76b830825125e29d496eaf4b2d0d54f699d16bca9858 +size 430245 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..7bfcc86284 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc2acdf41e4a07184924bf520630b2a044b48ff7ed103b35997ec920bdcd740a +size 466233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..55423abfc1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a76ebba3074d80d79865d560f31e08df829333d005b5db4c13149c2926f269b +size 542507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..eacb26ea92 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a48bcd44185631f860773de5c5e31a117cd4178c3f99b662f64304e8d46417cf +size 389619 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..e96b59357a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dc86c2de1aa6274c7e52c0fec3a00d0c1b1a25406c4720a82118d4826ae270d +size 443469 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..70f32949c9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0eda7f848aef2aa68cc45668691ce206eeba7ddab1be88814b4d7b03cfabcc1f +size 437215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..bbc60f55a6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:352ae2da1030b01a5f54710b11d744a0a696792ce1615f02be9578ec3d34ecdd +size 451797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..a7bfc36e88 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a643fb451c31d611c9a6c1b1e326961672aa54d5728f1ad9fd2b50e86f347716 +size 361391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..b3f8c9525b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dce82ff5857270495c52b1860ef51203ba2f410a1daf94899ef4efce2ce53b1 +size 375971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ec50b622bc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebeff38f62b494443a61c772ac57710174a53448e0b1f1f7787582399f05370e +size 452809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..e4212acab2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66a6dda136f900982bd99cbcd348fdbaef6be1fe66dd38131d4bcdc6cf46a0af +size 466601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f7c77cc50d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af62e1bcf966c67c1c959e196f5bcf464ae07894825435f64000d3afa582688e +size 375405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..a170ce50f1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0edca2231b04bd341d7de89767ca65c5353c911175ae01d1dc9e9ae53a685c2 +size 389197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..9d5cb7b51a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce5a88cb21e8bf751d4322ce6a8626151d92f3dab343f080f300ce1469756f64 +size 599257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..faa801ed23 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fbb97d0d0997c6165b8552d09bcef5d37a33f2c9267a842f47b3228f82ca29d +size 487567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f609958e96 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69538132260bf7b22730b181aa1623ad92c50608322e5626f608574620b4eef2 +size 626544 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..72f42209dd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d7298abe166b4d077a9e17b859284be9533bf9e8f0594d9ff755468f907ca57 +size 509425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..6114fbea53 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40d9cfc7c21d9685dd55dfac553378d505d2bde670a28ac55f9ef1b89e26e28f +size 606115 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..38138546d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb8e07b528fa7d8ca978356ca3740492e4e46fb6213f0d6acb19697efabb89ce +size 494473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..7cfa705990 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:901214d01891087f4c05244b90afbcf782ea021542ed9bc304193067644289a5 +size 633302 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..850d9a110a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0d5239056d786cd5e66dfd8baf2fcb9638c13f18128fd01148bebc730167fe6 +size 516283 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..bb5916b40d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:318b669ab3be36be0d2fdd3a51e8e8c8c2ef076234c6d40eb77989b1c70f5a0a +size 612133 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..85f9ef3dad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:451499643cf1d4e6b2ffebd089ed57da6b26a10d16edd4972044112c0275ea02 +size 513171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..d25bef6850 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c74a3938ffddaf8f5ed51eb8d093b2c29527e1aa8badb00fe1bc582bd86b183a +size 639420 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..bce9f94223 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beb3e8e4357132fa2a02b7172009e3b3ce90d787c0f887775a044ff70fea1fc4 +size 534981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..eee8ce8654 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23e11fc75dcfd2c1ef04c5e3ea70a86735cf5b78a527d944c67b8c707b7743fe +size 595355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..c06058f35b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05388691b4ab5d988f33bafb4fc6e7bc86328ae31ea7e531a5a7a018cdbf3b10 +size 483615 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..3546119ffc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01d8748b1eaecbd7c3d5ad28aa154b5e3eb6f3ad7d529613ba02b4b38f13533b +size 622196 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..fac60890c7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06eeb969793ce336c34ab7cbcb9916393e92b1c0f050592ca484005057945c95 +size 504635 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..2463520ff2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:123a9c687f3382a2b0618f93f1b78bba604e2505bc6867d3a84ff9f529a17d45 +size 569701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..d29e25303d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2990de9bea6bb863c4598058bf637d1f4aa64a04c591728cd8dcc28109cd740e +size 466643 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..d1e4e3e890 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:109acf0eccc987f509533880c761200b1b3b01cd5146684c2c1ffc4e2d609091 +size 586231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..1a29fe156b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa2e07a2fd33bb0e60e033f92633d69623a21e729580f5a1ca11585f61ab80c1 +size 497481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..0e9bbe3ac7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f3eb292456b63f848dd879bf9c5d9b16f38a6f512fb0cb0be5f77327240974f +size 735252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..9eb3ade3a2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70228828395e994848a25ee14bbd630a82a69101466e90209d081b4c42c30a5e +size 571613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..54ed340816 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e7d859070a4c9f6023b00f9126d3db2be9609185f381bd4208463017d43bbcc +size 738460 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..26ebd93b6d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21638849749f060e6937439fbc8a88372a191796db8f476acb5cbf3562f15ddb +size 586511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..d1551bb801 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f22c10a172ef6dc1c0ef501a615ded83c40ab5dc1474cb120488957d5aecb9ce +size 761600 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..71ebf61f7b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6706aba68ded04a42c1c2bac7f8327b53d314e1883bcaee9c4406c937025dc6d +size 597813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ee0cf5c1bd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:297d17f5f56d3d1d11563a153acfe8a279c167b82fa5492f7b9503f583ec1e52 +size 763968 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..22ede82549 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60e5ee036fd1578ba6d5f965c102b0e2612798b07cdad49cc27af3e3027e3fbb +size 614339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..44d3e5c8f0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:338ee646ce3dd318c9e3a6c19f354010d8578a847dbeae75a5ad9d19fe27498f +size 738410 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f7c798eccc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c572f4a2d085006db5c24324c8cae281868a787a054d4adad1655c7c8599c321 +size 575165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f8570b2aaa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b17bc50daed1edfc2b669ccbea3144d9da2d4b0424d6a9130db1124045aa5e0 +size 740828 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..5700f473dc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5050b9cfe30f375fa65c2678ef1f31b837cc536c0351ac4b372dfe28969878c2 +size 590113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..d9e3ebae1c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b275928774fe06234eee3b65d9e6182501a3f9078c2880389a24d2aebec15008 +size 764660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..b75b836e09 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebe557e59caa49a4d79201b573249cf87305ca5770594e06a7a45d97fba49b0a +size 601415 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..db5c527294 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:980e7ae3a7f54534c2f163bc261374b1a2dddfa12c5cf66c4d45bfa43545d1b7 +size 767076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..b68465ab58 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:451ba59845b37dab465508c70442725fde40f768fde7b34c9a188634b505d43f +size 617892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..98d47a61b3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83caae6c28ba630688941135116d0c1f9ff51437ba519ac309cc5683ba216e8e +size 748326 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..285d7e041f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a921dd2cdd92672876cb0d59a298eb5ef08d10b6156ec237220afee81c5719c +size 584933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..a7d0e31e5f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a507315d9c8db8d1fb77929321184333bb76aa99746bdaa6512536ae588cd763 +size 751532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ddbfc20fdc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5decabf6fc2a50893aad9a834349fcc31fb8a216c644dc2e458012fa6cb55c1 +size 599831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..365005d9fc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4710bdb0596b29874a78ae420a8acaaa5b57f822904d73cb41dba34e2beb1bbc +size 774624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ccf9a91e69 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad3f7d9a3768ebd6670b3b99cdcca23f679315fc846f632522d41797d8b450d5 +size 611133 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..af9a2784c8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5903dbf2b451921982c0df8bc5370acbbf179d53b94bb9cc14d4faf027f7e97a +size 776992 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..4a4acf044b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ca8b6ff340611e959377732315416514f7de54ac94efc948731ffc4bf4c38ce +size 626872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f28bd37c7d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f36955c479b06039d684f318d7d3e23a3827bee3863e78a7329f47455380da4 +size 734606 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..2f468d7c4d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba2316db4b8f3095be7e9bd95a6799a5b3af07ea093071d3a3139d5eef741dfb +size 583743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..70e350f1eb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e96d80be77cad1b3f5d84933f8a97e47ca4fe6e9e27fd5697e298e648ae89e0 +size 743238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..58ce60c5ce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f921f153f38254a8e650326e862b95c4df5c902d579b1a4f8b3d49ad1011463b +size 583101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..fd6c79daf0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2ea96239248cc81f9be66cc651ed09aab5ddd59f346cce8817dffc143ffcd4a +size 749998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..646d6d5640 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a94244eadf0ce39eb3ad3235bf3f876accb38c5189c33f0fdd9f0f25db5af18 +size 588233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..bf49fb5358 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c649b0581e74ba7fd18283ccdc90052db61c31d6d6e7752acd71a29a1320f86 +size 791700 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..46dc6b38b9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e17b2320cd1dcd9cc5d7115218e40e26ee2d0f19728f9f3871a6a4bd610befb +size 619182 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..c8dc76f503 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eba99dc141122cd2b2344e4bd749398e38be5a9d98bcab0166538972486ded6 +size 801122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..650b29a338 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d2416e1ddc10cab6ea845224b5f890e56148829f7290cfaa5da4b1f431f94fa +size 620514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..63ce147ede --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb1e1c1324068250c787279b0960d1198b6d6657c7641a3259b6312c1be8c416 +size 807042 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..e9b47e2cdc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee79bb3e73c11de3ab92454b014292a76e098308cbd0b41f38fb8ee4f95931df +size 625644 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..503e57881c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64baf25347d0e5262575f2618b582ece5929d4aa810ed7da37d306523099ef7 +size 760114 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..fb7dd13df8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7282912932a0a069aab1966c6f30e2ead46d760ffabf6c964fb156d218dd3fcb +size 610733 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..10273fe90a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098e7986bda07909e0e492c35f4c2af308dc9af44240c1e4de12d16c7114d84a +size 769538 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..9efca9321a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8450239859dbcfaf689004509accf1fca36c8daaf4ec94bf9582b30472bc4bd8 +size 609301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..493e70baa0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b5c131e65fe0ff8f49b7a5dad88aeeb6560a4c8a2f006cffc67bb1194e3bb93 +size 775506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..907de7b9c9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8cbc101b262e991f7b57abc063612a0f9e86aa5cb846cbc9c3cd23bf5ff2f1e +size 616061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..181178b95e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:512caa9b7c132962a8c6a907645efed54f1038be9d444dfe4667777744599cae +size 817160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..a8ac86fa62 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe920a11433864cab6ff6975e343fff484d9d12fac489f83916749a0a7f9ddbf +size 646170 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..d6d53f931c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:747cff4edb2871c01aeeecd4d8f46b1761329d03a46b17ad156537251374a1bf +size 826582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..3dc3dc70c9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23a7e5ba964fb4f33b0c27445107a0e1fddc680adcc0c617cafcbe4472449384 +size 645974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..fffc384cd4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0c034b77c1ac36d3adcbf7813f2838ab7dbf66ca66ff94d0722f6f82d3faffc +size 831714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..74a174a0af --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:599cd87a3e99527d6eafd60b2e21a1a9ad35628ec64878188ea12bf53babb232 +size 651894 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..0eef71bd66 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:919e7bf17798e7523b9b0a3e0638bd11c9d0b4b9a3f5e3ed581957308d49a0a4 +size 668696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..5097a6165d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1f130914f3351d371a53bc40ea13eeb5120ac487fe0ad9bf9579fecad4dbec2 +size 560113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..73efd02436 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:629af5849f363cb6da0fa3c889f106aa341323acd166ea5dce9ddc6ff35278f8 +size 720906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..64dd3e4e12 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51094182776057994a68986291de2659f5051c1fc63abbb02d7d9f8f1f08d919 +size 594119 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..cb39ec1035 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1feaf9b0964dd568f04990a72748982029950da289ec8f5cb99ddb4d2a780ad1 +size 697412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ac3e1d6d0c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22fa6763145adb1f624bc462ec4eb50be819fe3b44f99299cdaa4a47d197457f +size 593367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..df202615ee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:032331672eda33d886553bb87e179c3a2874f900f7a9f43759eb69cec857f1bb +size 748882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..e14b56549f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18dae6631cc77e8d99a4739343da0ec6e6e13774d9d182616804ab495ebe5877 +size 626586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..879f3aed3b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20e6a1d7e40530f956c3a7629667a64a2baa3ab83dc511ac688207d7388ae45a +size 742604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..aeaf7ff4e2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f55f9596c2eaa28e18f70709c4ee5f3429dcd03c4a199cf23c4e4462270f37f4 +size 645220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 35e8c484d3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e7fda865674570a1fd46c6a1819c2142b2e56cd64d0495c268930cd85265d223 -size 730664 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 8fa8b26fc2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:49e9f3da47698eb56eba8de6a47b68468a741a56c956208b6ac5d83e466d643b -size 629976 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..9907206347 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:812d7fb21589408e255434329dcc71b879bb6f65cf5e3fec7cb2fa80cc015676 +size 766584 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..4a8424e786 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ea260065ce5045619217096cf4dc46af8a5963502ad08b140e260431203b157 +size 670482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index bc0df6954f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b14877b81b30776f826b7a2614bdb29833f56e6ac84e2fa29744316e83172d0d -size 763722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index aa48341042..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ff57f1df32fd6a8e27822c6a61941a2bb058b93d961b031ad603f9ce9ba605ed -size 654152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..6c17f2c829 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff9fbf0db93905b4312b33fe7ea62cd333762c61f654ea6937f4442c865a4efb +size 765494 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ad6c73e5be --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9122ad3f78bfc50d7d94699b3f1a1924d31255c3500d5775de698bfdfc777473 +size 680394 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 52b1f95ca5..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:117af8887a3e305a1b5dd133fa56c79e7c2a65a21cb99baf779f369539865582 -size 747388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index f252d655f6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:862a03ca3ffdd48548a2ef0cc6d222900768b00a70d2b36579880c5fe6a3a383 -size 657996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..9e5650cf3d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c68df8ca5e229a5c7bedb98434d31275428c9889f70889153afb1ac3e3eef504 +size 790954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..842fa2574e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83a0fe24c00726afa84b6a1a76afcf9c5d67eb9e5e33cb080bea4c455cf41682 +size 706594 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 0667d42897..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b1261768b637531bea3520e8c2763b981607f330d937034c7b6fa33777ab79d8 -size 782814 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index cc19dc3a7d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:817ed44a8b2e61ebf0e0a9336b05dcfcec4e29e8173df95f2a9986cc982c8a0d -size 683062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..10f41e1bbc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86f5995763a54e4c9125c5fb8c0629851433cd77e95a3f504f3eb988469d6221 +size 865986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..bb55a64cf2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55e322589b34aa402149203f60e5cb70ea4741cb07a5da0585649facc023ddf3 +size 766678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index bf5d600b67..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:333e7b6120bd40c68a252d28c06cc66877e3cb0a129b7876437e61207aea2d9f -size 833870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 3121fb0ccc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:46335a677f7d15101f872b47ba27f23ef9df7e2a079991b61706ac3bd78a9ee9 -size 730960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..1abcf43256 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:988ad0ecd7fc8f233ef8bb1009dfddff4d8ef0b37fbebcb69bb8c8379b0c7b05 +size 886562 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..aff67665b0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_s6_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df53267af58f53f40ccdec64d61a3afb934165d62a4b16dc4472df286ccd21d4 +size 788832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 4d3e7f2908..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:69ec7ee2099a8fd07fdf480c320e3d0d1f49722e759befd8f3a9e04134797c97 -size 854298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..87cea1a750 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40be4825fbaee1ac24867e75d989b48dde45e87e4c87e651ac1e801b0de9ca7a +size 748616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..0634a26052 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18e20aea1b0e6d53a4f75cdf579a20ca16ca83c37297e15f199c3eaf3797f120 +size 613245 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..e83edbc5ee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4b7060d387bc705b42466010269b9882a934e16c3ffca5219fa29bb742bee6c +size 565543 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 3aa88e676d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3dac191341e0c2ace0a7bb836dab5c086d58939174951d2bde5af74843dda86b -size 738946 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index cc029f6877..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a4c653abf56693b3b4c24c11242b24f384b628d7820c9b5e1c3001067f8747ee -size 601207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index c58b8c46d7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:40f9c39d8f6608456ba13de259673d08f9eb678151d5ff8a716b30c63a4c2a00 -size 548423 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..a64649702d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb6fc175257babd0e885b76012c8f0098ff610eb1b181af9fa8110b6e990ff41 +size 773188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..9bec6b7751 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec60173d112757eebb0e8d88227e633d88dceae516203cfceab55a32146b2396 +size 638508 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..ab6b44854a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe3f84911046e73430b76cf2aa3848b2ce7d7c867ef6160d8069905c80fb681e +size 592433 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index fe4d02ba43..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9bbea5715d6a2b792fd9e9c38653f387a53d71fe412ae446254cebdf4fc50ab8 -size 763222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 60f241ea64..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5f9a8b178f6a88f0264ce1dc036afe230dffad101a027113c1e106d0ef195500 -size 626174 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 2a15a9413c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f5782e26e596b32e13a8fad28e8fe58b8cd27a3cd1dbec10ad0fcd8c94bb766f -size 573933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..6e6c17ba9b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19236ce7fa5b015f31b65126651aa47696dfa7b137c22c968c4069cee5b88914 +size 655490 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..61b1cd12f8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8675a5cb51cdbf9af8aba9d4f06e3fa245a3460ec8a68f80f331236e76e0dee +size 533341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 63edc3f7f3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12ea1127bc6caba48d3c6bcc3a6737d5f62d9690fb9c6f3c3fda376d2d60b141 -size 655590 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 81d51ca5f3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf38ae8b4a37ba781712ecafc76f2d7ea12cf2858c9315be4c17b67300de6532 -size 529197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..7201fcdab5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9449a3d28a9459d2eacf739f659c35fe171f75a5eaa3ddb7142383ace0f238f4 +size 683960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..2a25c0b4b7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_s6_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce46638ffe3ac11de2a4c54ca471c20af14b1643ea2632470152a3c9fa181622 +size 551795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 7197910fda..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eb1908f9653078b018436b7911aae221359ca4438fb305eed8ab4c94af935ec6 -size 683072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 97e9ec7d93..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:58fe371417dce440f8502cb609083eabe611beb4584cb561fb89c55468794a94 -size 548145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..6ff0245587 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9b9af14df4f0b460d70110d854a72a21cc37ad3f9b73a95a2ae5dc16339d585 +size 674188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..51e7f96830 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9694fc3c405e757e1a3838c38a781170786935d5cbbc9fe2fee6c2ac118467d7 +size 557169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 4889eca703..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6509222d39a698e1411f9a829fede1cf78a0fd924964e1ab4889df5accf24abe -size 675224 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index ba3e8d95e8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f65079590fd9eb664f40b98542c1612a7b1396852ea5f783c59108b44ff80a51 -size 553813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..c42b59827c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcadd6e022a2b3350702e512793ece47852af8980ff481d8f1a3ced56c07c85f +size 702164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..cd5aae8cf0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e5372d77136b54318bee661f35fc377f7f34d62d6a0f27843d4d09095d9b270 +size 576413 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 5812226dbc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f41a31bf0ea7db5fa45e592c73416998c04e6da558c0b1d7496973d775b9d28f -size 703694 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 89c67a92e3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3660a7406d501d1dc0ac7fee6d18c510433a1fac8764f27f8efee84d2a12fdf8 -size 571973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..9c124ffc2c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f77987cf08d3298fa43896c1e2b16ac9588f55de0a51e6946b1f9457c62bae20 +size 753418 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..26e1022303 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffbac1931b0100e8aaab16d57519c16b64bf1e7684331d6bfd617fed9745795a +size 624608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 95b8365e30..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:02653b7787a6d0c0e31d3b39a356228de6def37091fc08d61b2d95d61dd504a2 -size 756624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 0cc0d5622f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a5a1b4800e0147a8819d0712ce767f8a1ed8baaa807e04bec0b04b93cb55334 -size 621254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..268fab593d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:892488225d9155f32f5919ad1af7d2644ca30b9fa4f95227bf7bb2d1152c5023 +size 782626 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..91bf44f7fa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_s4_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d2b7bf1e3bba30d677408bb54cbc39578d20a759adc457e710fead72c1c07c4 +size 643852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 89ddcd85dc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:04bc350db6e92b32532591e6a7f4817af6bebda9dfa136a004a7f26e49ca002d -size 787658 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 127faf4ba1..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:913d6f95d37a62baa493ecf88929cca8c665b7b4052312d70afa0fcba9bf8443 -size 638622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..75944bb10e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ad75deffcd8d456dd9724888a75385956f54558a77fa16b5a1f57094f2cf9de +size 522005 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..79f6c7316d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:244be2bb6efea6ffb0de44a16014de2707bf0439f10e58c66efc78a05e07965b +size 635604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..2a3840bb23 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8585ab6a7ca644c166537fcc9abd8d158762be4b7b07c95f8d58846666c1059e +size 552377 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index abcf50c9ad..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dc1f829768f394a0c3e3f55344095f585668a09352653769734a5962d558387f -size 512927 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index f2f72fb9b3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d6d5ee131b5218cee8bc4bb8eaecd7127c8de9311b38e81bde5a51d8b19cd0c8 -size 636492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 66719b426c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:76ef5916b9e88b7fd7d58ba9844c804cc572d242365425c639160fd22a32b4c4 -size 549663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..2ee153dd69 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa7eaf2a41d5c958e6953c693233908845594b0e0fddc496dd3195bb2c492d81 +size 539819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..342ef56938 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb8ed53090ad271c3e928c31db2051501d6f396e6a23a402fdd196fc762b9837 +size 661606 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..6cc734b853 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s8_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3997e49ea7ebd1925ad66f2e72db70b566b47d87aaf4f5d594cbc040f91dcc94 +size 571029 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 9cae415dd0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:62cfcb8ebb10fdf8ede8d85465cc4667a1d1eec3157749ae9b78b8c4c2aad399 -size 530939 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 349c94aa3f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c04e8203a73f55ae51acd11cc2180b1a33f107933f6c84cf8a199caf28bbd55 -size 663184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 1411ab91ee..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1a38882e74f64e637d69c47d8a971625eb669bbbb0a2e7ed8521cda9bac5596c -size 566491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..7c22af6fe5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95e05dbc6ca4e1d9187c39644222d24c14c831bd045bff845b2964fcccafc284 +size 352915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ca2ae68148 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5b3f0b27904a8639a215738b6966ea619f05f32c5f266e082c8a93ebd59cb34 +size 375521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 2e270fc1b5..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:932b1f3765f40aa1751e80e065ca8548f87f4a9d641d962b40b4ded3551bd35e -size 362729 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index c5b7be4b91..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5c5cad59c87a548f7d7a0a688158c9c86793a09aa44d986eeac49450b2775f48 -size 347391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..0dbaad2b38 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75addaa9c008fcdc9149546ab2e27e3f659d2cc76ab485c2e28ead8e0e75b5eb +size 376255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..cfb4c349ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d39083ad9387d77328486e136760519a0e8671b4d810e8aafcb4954b0d27b0c +size 400241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index ee9a432fc6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:45e4bec76b0d5e8e78e8756660a27a89e073b4af4843ff01b9867808c3e93004 -size 385575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 0b45ae94f9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ef760f50f0ec4426fa7bc7ecaa6f07415f68647ad061b6970d472c0f4e3d4b7 -size 370137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..6d9dda5a44 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41bf3ba3de11437ef7e50e752b4a204e0dd09406334dc4581b873d7963f1cc2f +size 601201 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f1b50d17e7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:229e010b819f373b2a07ac7f30fcecb07a348b7306fb7c8eb9720c237327f816 +size 504311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..9c6006e894 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ab595a44c96cc5dcd11a505e8a5170699c17141514499a3c2f161ae4211a531 +size 623802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..871475e74b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db09c51417a1124918f53bae88b08308fe3556f3cb9eed96ec2448ba804d7010 +size 525233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..0ee29a7bca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a5020b92dbe6f35c028754847e441e98c069033e1ac0af608d07a1e450bb333 +size 608305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..be9994f574 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:854b14a060535075e170a0cccf9fa244d33129cd013f92d4e3e0ba3d38180142 +size 505397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..49c8e0bd1f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e4efddcd16a359af61a8dc800a9684aee0181c388858f8fbdf4edbd9bf0cd6d +size 630856 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..1ff358db99 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d57c3102fc93945b6281577486b274378b40b3ae1df7ab182925903e9ae0979 +size 527157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ed3cc9c14a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30c174c8b448518da813c2e74c2a4679996ece9d13363b717862246c83f01ede +size 666422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..e5459008ba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:377759eba68f050215ea58ce105fd0711041d269d17e50590504054e57af173f +size 556457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..e7e2d0f26d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a5f47c0621e929cb4bee3b77c909d8cf1793bbd5fda01355362e000506debb7 +size 689118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..6e4c935977 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4331a4470d5dab85cfcf884ca379fb54fb3f141c069ca3289a5e7ac3d61864f3 +size 577377 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..b5e064b119 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d60f45cc68847ea38a4fc10a2e3a493515ed2d2cc65815f2e3c02ce93e1a02ae +size 614269 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..3a2da152ac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1759e6a3d86b960c454b1383c2437c233e096c62783d079a893981e4189272b7 +size 505489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..8985aedef6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ae8737125626e0fb1264983da4bd149ed3fe6be76b65b62a11a293984990a51 +size 637116 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..fedc0ef9d5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c89f6d709bdd3bed241a77d14138fb227caf8a660867f93d9862f1da38a04c74 +size 526459 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..7ffcab6ddd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f47043eec2801e155912e1be57a6b62ee7b7f526d3e5fcad35f4fc7b66c88b8b +size 573667 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..5e40e178d7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8598b94d0d1c8bc99a3a0f11c42f253fe3dd904921f0e6d76f9874c3e38f2ab +size 482549 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..0f3c30f3b6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64560ac36a2c677ff7d10a96746e42603589292f6e8707f742f56429144b3fd9 +size 600559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f06a9767ad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3ee68df7ea0d9578cfff391d8e859fa8a97937e397d9e592f4805824af55e97 +size 512597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..fbfee49845 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62fd56002cbdc89f4cf54ce93e2a9ea630dc88dd89a0ccd988ebf9dd6a6315f8 +size 558931 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 6da407574c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5789ddc8bece759e54f7e53a06eedd34d24a59cc0e216d2ef8ce21a31ae1a95e -size 541813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..9448465193 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_s4_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1234c02a807fd3324bd86f2faaf028cb41483c5b1f7aec62a2663b48d39ddebd +size 585033 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 2a7c966d75..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f8425ed46a527766e2b20a59db17be8aa8511a7d03022685c489b3d1d1450118 -size 566533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..0a98d5cead --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f4cf45795686e51e219ef8ac98712f573955519b233c723690e2c1913eeb885 +size 508439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index f3d6284577..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b381989464b296ed6bbeb5cc30b83d1e736e9ad521f6b37e3362cf6be4a880f1 -size 500101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..c572b5d80e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_s3_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d5318664344dc563a4f7fc7b9f2b1dcf56579d52f52a60bdf35e8638b90e713 +size 527041 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index dabdb1e60a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2916365c16d42aa950ee87cd0e11b8e8a4d0573714a6bf7e6412ca6cf058be25 -size 517323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..165a401537 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59e9083565db309c2b23dec7856548cf904a1c79e4a3d4e36f29fe15062e04c0 +size 352817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 0aecadc579..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fa34a5fd253b0014ddc93425ed5ca30349cc1e88a8f0492dc679f96b065236d0 -size 347291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..036852087f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b86cdb474ce0f703b0106944e3d5ee7bf1ea69c96764dba4737eb378a1fd870 +size 376155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index a95515865a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:83d02d90f012c3c2aec1959f38a7607add5c4492be4c300522a122b6dd64a0e5 -size 370039 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f017ef13ed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aec9660e9761d5b4f0a5a1a7d68fe6f4d2a45c3791d320b808c85def581381c +size 778000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..742efaf926 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06cda32b974c113fa3e52e8a3b5a9d258460ae1c042be859652435d5a69db324 +size 622304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..26f92464c7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76effa82e68822ce8ea47093421f173f31e9024827016d934ce9649d0853f5c5 +size 782932 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..eb99a94e6f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:398ce6c746a75f727ba3bffd6eb3d503764899d87caa17e82b3e66b01f20e42a +size 631282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..c60d5b7f91 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc6b30e8f692902a1f0ab347cf0b9d024c666ea59823ef2f391aefb876d4555e +size 802670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..7c47454f4d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:021948fc1d45646a0b1e89dd9d907fd0510438804ce5e47ee6880b5b515698f2 +size 646728 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..3ddb5d5496 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d1ab2f050f04e5b95395b97f8c875e9c5205d7b9fd4458adad5ce08eb7b103c +size 807604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..de1667dcdf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d1009662e856c1192bc9b6c8f357d29bed33d8835e983cd4fbe0bb6ab517ced +size 656544 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..275097ae4f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3162e8466b946b9ce7d20cb357cb7bc21a4cc98da4220733655ac5973fc0c83 +size 810708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..cbd1a01c7e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c145e6ae86487ffe86cdb436b5ac44df3d6d2ffdc27f7d93b7f10280ce53633e +size 662412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..e218d4584a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae9326557d9d097349f9e660d8c40cce5ccf3139ee5bf665365f5a78b1eb84b +size 814900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..3c153c1d78 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10b4b784a560e6e570580a7f94da3c3185954c9c04e5358a5b4efcd098ce330 +size 669762 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..51ba64c407 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:816c537e9072ec4feb259dfe450e2b64140d086a29ecfd164c6cdef3758079c3 +size 837598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..e91fcbc491 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84ea02fee7c45f6d30d81df0f5923a0a2b4755f41f4cdfaf29531537f57b008f +size 687674 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..acd7ca9906 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84d7268a7131044bc11fc79963372f580dcc17715c8ff8ee6e608c455cedd6a0 +size 842532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f0e01bfb45 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74c2e0ae9d404f7f81775047eb956b58a5aa0e6844877c21a9a2f3705e495ef4 +size 695024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..436c57974e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb811d1356be5441b021edd1dc8f51821a5bda70d2a231d81a90eaa930e2eaa3 +size 889000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..2aa104d946 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f16f5916e1c29922df000c379f64a20d6ad83293eca27db9c43a49fb3556553d +size 734192 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..8fceb572b1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7538039aaf3cf2ebed55187f997002aafc45baf5a4a3fac5ff3cd0de09189fcc +size 893982 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..4efa8231f5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03cf555fb6740ec8d7a29ee728faab387ee78a8f9030cf388b8b64ef71c356f0 +size 744552 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..805b8eea4e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd694715247aac0c0f617609b0ea687a801ee1255c547dbe2cf3f402de9279c9 +size 920132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..55b6b99cac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f621ebe7beb00fcd24035462a390bb9c8a3249192129d2d81fb86d70900b0e0 +size 759552 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..185eab7e65 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b69bec4b6f356dd29e37542298802815998c78c2905d86b2af352a5406eb58a +size 925066 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..e00d5bd0bf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afc8098a1c79edfee18d0ea4f811c4361e70cd43f144a968f6d176053be64c1e +size 769962 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..06ec82e224 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c585f95abb4d189c23442d7ee9157b3c54c5b60d163870fa1a50badbf785d476 +size 788846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp similarity index 81% rename from cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp rename to cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp index b5b23663c8..017302673c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:476a5e853bba875db6b46a78ac89ae65997add5e50370b18e32d46998a68042c -size 752868 +oid sha256:294deeb9540d408c3659573fcff60a4f9a5ec13e4e04295d67b5759ea402626b +size 624418 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..afd676a7d9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac1a27c7d3a21687718b363fa606077c256b6521b1106ef2333656056d3155de +size 795754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..60dd37e915 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b099eb23550359125f257bcb0761088bff3b69b3a2587920801349d7a82d4ed +size 632066 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..d7250f8b64 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37b3985344b25daf1625ba0f6ab56f77aeb352d695f4f14e6548af1004d7ccec +size 802462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..864a7b3d61 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa8056dbf890836f0e8c5d8f54f3c5e0a993af714996a9916aff2eac04b5e7aa +size 638232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ace3468aca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:853eb08ce58255d95404ab40a06ed670ba6e8b6cbb10689076efdeab4accaf55 +size 838048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ae5f240a4a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b70e7c824f65cc2ac4fe83dd42bb7089021fdc7a1757f886d75ee7b4e0809a3 +size 654232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..99ef7c3b84 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab5a0ab1595df3fc1a4e08dc17eb9cb2ad94ba3df2fe448e6c0bd472fac29521 +size 845744 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..c9d89c6fdb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:131bf2a32e50304bf96ad8a841c419dfadb3c35bcfb1edf09aca5ddb79f91ea1 +size 664690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f243763a25 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba5d79a98f8924d30ae803d262bd6916c650b327ab252cf78cf1620b918f372d +size 851664 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..77af8e4797 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ace2d66880a3dfa688d554af545dabe7e80b3a87f955d3f96fd9652a75e5042 +size 670808 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..fa330625be --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72b4c5f75b36af19bbfe05463253cf0effec24b5e4310b597949d3a4d1b6597f +size 814110 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..105d4f570e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s4_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b83e8f2ec284ed7c9d3107e4773b7664e651e2919fabfdf1808855408cdf513 +size 649090 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f1f1b92d35 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a1ba02d2f8910dc2751a0073bcd8e814e427a6eca951584c33b576ba3e269e9 +size 821806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..d17a7a5192 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s5_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:145d60a5999da63c32f571b775f7803bb6b142562bf58092beb02ac4c434ecbd +size 657328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..a034dd7088 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:804129ff75a28096c9e6dfbfd1a34e7e2a4bd13398cec0a35f2a2360f8fd97bf +size 827726 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..4ea7426a77 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s6_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1623708b6eb3d7195cec9075f4e0a6ec4a81c07ae4697299a7d245fe55bbee07 +size 663446 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..21f83bd479 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f7ba8665ba1d4fc7b877b45bb4d29faf470ab14d3be13c4e672e7cea80974bc +size 864198 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..efd394b462 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s4_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf14975fa89384752b40f4cffdc1a0e01ca788e401e649d39b4a5d0c6e686600 +size 679494 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..554eed17fa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:794da89437bc1fe4762f592d014853d72c7903ee75b35a9946c3da7677443cf2 +size 871106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..4eeb31deb1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s5_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cb43ef9c01e95517e7a0943d791b7a00b9806c95358101624c76c4f68e1ff92 +size 689954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f34e301159 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc95dffa6e796b88304a0ba270f98fc0ea0d6f1d32632f0056c463ca0ce36f60 +size 877074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..8758e75d62 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s6_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ded1d71090eb1aca9383e60d723886dbdb2c48f238deb342f534a87a177ac510 +size 696070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..83d4fb53d6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46f1273a18a684cc78e7c7d6abe572552470f39b2b5c68d0cc06e11743d857c8 +size 713416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..9e671ceedb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:721afd2780de412e595ad8eeedb8e2d55b8aef9f7e8f0e18e44ba5b0f074aed8 +size 612183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..d961954ecf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dfbc98cf787c9d550eb18a862a652578f88af189603be6e9821a6464288e372 +size 763654 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..38892e4de1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0082eec5dad1402d0ff4ee616b27740a8f2e0a4dc720f363102b95f0d63ea6cd +size 639186 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..a1aed4189d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3442cf46de7444f5b86da473cb81211c6a3acbf7a7e46e5614f322b2920d8e1 +size 742526 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..c2cd76c89c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_s3_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a41b129268bc0adc71c966870fd74fdd8b27a9a9662aaae3c703777743a827df +size 646474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..4a427d9f88 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:631ac2b1f60c0c22f624ad78fb46cc2b3a94e9d7c41410fcdfe491c8dc966ab2 +size 792270 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..4e6e15950a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x2_16dp256b_s3_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5323f2aabee9df5681dc3d54f5c11c7ec5eb64cf58ef230a7cc81021c2204c78 +size 672440 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu index ba850c45a2..65cd5f3c59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu @@ -154,7 +154,7 @@ __global__ void activationDeepSeekKernel(KernelParams params) float constexpr E4m3MaxVal{448.f}; // Compute the absolute max - float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cuda::maximum()); + float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cub::Max()); if (threadIdx.x == 0) { s_scaleOut = aMax / E4m3MaxVal; @@ -516,11 +516,11 @@ __global__ void finalizeKernel(KernelParams params) if (params.expertWeightsPtr != nullptr) { TypeExpW const scale = params.expertWeightsPtr[expandedIdx]; - data += float{scale} * float{params.inPtr[permutedIdx * params.hiddenDim + hiddenIdx]}; + data += float{scale} * float{params.inPtr[permutedIdx * params.hiddenDimPadded + hiddenIdx]}; } else { - data += float{params.inPtr[permutedIdx * params.hiddenDim + hiddenIdx]}; + data += float{params.inPtr[permutedIdx * params.hiddenDimPadded + hiddenIdx]}; } } @@ -549,7 +549,9 @@ __global__ void finalizeKernelVecLoad(KernelParams params) using Type = typename KernelParams::Type; using TypeExpW = typename KernelParams::TypeExpW; + int const hiddenDimPaddedBits = params.hiddenDimPadded * cutlass::sizeof_bits::value; int const hiddenDimBits = params.hiddenDim * cutlass::sizeof_bits::value; + assert(hiddenDimPaddedBits % 128 == 0); assert(hiddenDimBits % 128 == 0); // Load 128-bits per thread, according to the smallest data type we read/write @@ -561,6 +563,7 @@ __global__ void finalizeKernelVecLoad(KernelParams params) int64_t const tokenIdx = blockIdx.x; int64_t const startOffset = threadIdx.x; int64_t const stride = FINALIZE_THREADS_PER_BLOCK; + int64_t const numElemsInPaddedCol = params.hiddenDimPadded / FINALIZE_ELEM_PER_THREAD; int64_t const numElemsInCol = params.hiddenDim / FINALIZE_ELEM_PER_THREAD; auto const offset = tokenIdx * params.hiddenDim; @@ -592,7 +595,7 @@ __global__ void finalizeKernelVecLoad(KernelParams params) float const scale = (params.expertWeightsPtr != nullptr) ? static_cast(params.expertWeightsPtr[expandedIdx]) : 1.f; - auto const* inputPermutedPtr = inElemPtr + permutedIdx * numElemsInCol; + auto const* inputPermutedPtr = inElemPtr + permutedIdx * numElemsInPaddedCol; float4 input = vectorizedLoadPtx(reinterpret_cast(&inputPermutedPtr[elemIndex])); InputElem inputPermutedElem = *reinterpret_cast(&input); @@ -650,14 +653,14 @@ __global__ void finalizeDeepSeekKernel(KernelParams params) float const expertProb = (float) params.expertWeightsPtr[tokenIdx * params.topK + k]; float const scale = expertProb * blockScale; - acc += scale * static_cast(params.inPtr[permutedIdx * params.hiddenDim + hiddenIdx]); + acc += scale * static_cast(params.inPtr[permutedIdx * params.hiddenDimPadded + hiddenIdx]); } // The largest (finite) value that can be represented using E4m3. float constexpr E4m3MaxVal{448.f}; // Compute the absolute max - float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cuda::maximum()); + float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cub::Max()); if (threadIdx.x == 0) { diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.h index 797aeadcb7..ea2a569777 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.h @@ -395,7 +395,10 @@ struct Data int32_t numTokens; int32_t numExperts; int32_t topK; + // Hidden dimension output of MoE block. It is not padded. int32_t hiddenDim; + // Hidden dimension output of FC2. It might be padded. + int32_t hiddenDimPadded; int32_t const* totalNumPaddedTokens; }; @@ -416,6 +419,7 @@ struct KernelParams int32_t* expandedIdxToPermutedIdx; int32_t hiddenDim; + int32_t hiddenDimPadded; int32_t numTokens; int32_t numExperts; int32_t topK; @@ -434,6 +438,7 @@ struct KernelParams params.expandedIdxToPermutedIdx = data.expandedIdxToPermutedIdx; params.hiddenDim = data.hiddenDim; + params.hiddenDimPadded = data.hiddenDimPadded; params.numTokens = data.numTokens; params.numExperts = data.numExperts; params.topK = data.topK; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu index 939dc708d6..f03e02c2e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu @@ -27,6 +27,8 @@ static constexpr int MaxNumTopExperts = 8; static constexpr int MaxNumExperts = 128; static constexpr int MaxNumTokensSingleCluster = NumBlocksPerCluster * NumThreads; static constexpr int MaxNumTokensSingleClusterScores = NumBlocksPerCluster * NumWarps; +static constexpr int NumThreadsSingleBlock = MaxNumExperts; +static constexpr int BlockKernelMaxNumTokens = 4; template __forceinline__ __device__ void routingTopKExperts(cg::thread_block_tile const& warp, @@ -75,6 +77,156 @@ __forceinline__ __device__ void routingTopKExperts(cg::thread_block_tile +__global__ void __launch_bounds__(NumThreadsSingleBlock) routingIndicesBlockKernel(KernelParams params) +{ + // types used in this kernel + using OutputT = typename KernelParams::OutputT; + using InputT = typename KernelParams::InputT; + using BaseType = std::conditional_t; + using TypePacked = PackedScoreIdx; + + int32_t const warpIdx = __shfl_sync(0xffffffff, threadIdx.x / WarpSize, 0); + int32_t const laneIdx = cutlass::arch::LaneId(); + int32_t const expert = threadIdx.x; + auto scoreOffset = warpIdx * params.mNumExperts; + bool validToken = warpIdx < params.mNumTokens; + + static constexpr int VecSize = MaxNumExperts / WarpSize; + static constexpr int totalExpertCounts = BlockKernelMaxNumTokens * MaxNumExperts; + __shared__ int8_t __attribute((aligned(128))) smemOffset[totalExpertCounts]; + __shared__ int8_t __attribute((aligned(128))) smemKIdx[totalExpertCounts]; + + using Scan = cub::BlockScan; + __shared__ typename Scan::TempStorage tempStorage; + + auto block = cg::this_thread_block(); + auto warp = cg::tiled_partition(block); + + for (int i = threadIdx.x; i < totalExpertCounts; i += blockDim.x) + { + smemOffset[i] = int8_t{-1}; + smemKIdx[i] = int8_t{-1}; + } + __syncthreads(); + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + // then wait on primary grid + if constexpr (KernelParams::UsePdl) + { + cudaGridDependencySynchronize(); + } +#endif + + if (params.mPtrScores != nullptr) + { + // in this case, each warp represents a token + BaseType score[VecSize]; + int32_t idx[VecSize]; + + BaseType warpTopKScore[MaxNumTopExperts]; + int32_t warpTopKExpertIdx[MaxNumTopExperts]; + + BaseType minScore = BaseType{-INFINITY}; + if (validToken) + { + routingTopKExperts(warp, score, idx, + warpTopKScore, warpTopKExpertIdx, laneIdx, params.mNumExperts, params.mTopK, + params.mPtrScores + scoreOffset, params.mNormTopkProb); + + if (laneIdx < params.mTopK) + { + int offset = warpIdx * MaxNumExperts + warpTopKExpertIdx[laneIdx]; + smemKIdx[offset] = static_cast(laneIdx); + if (params.mPtrExpertWeights != nullptr) + { + params.mPtrExpertWeights[warpIdx * params.mTopK + laneIdx] = OutputT{warpTopKScore[laneIdx]}; + } + } + } // end if (validToken) + } + __syncthreads(); + + // set local experts + auto localExpertIdx = expert - params.mLocalExpertsStartIdx; + auto isLocalExpert = localExpertIdx >= 0 && localExpertIdx < params.mNumLocalExperts + && (localExpertIdx & params.mLocalExpertsStrideLog2) == 0; + // Get the count of each expert and the offset for each token + int accExpertCount = 0; + + if (isLocalExpert) + { + int offset = expert; + for (int j = 0; j < BlockKernelMaxNumTokens; j++) + { + if (smemKIdx[offset] >= 0) + { + smemOffset[offset] = static_cast(accExpertCount); + accExpertCount++; + } + offset += MaxNumExperts; + } + } + __syncthreads(); + // Get the number of CTAs and the offset for each CTA + const int32_t numCta = divUpLog2(accExpertCount, params.mPaddingLog2); + int32_t ctaOffset = 0; + int32_t numNonExitingCtas; + Scan(tempStorage).ExclusiveSum(numCta, ctaOffset, numNonExitingCtas); + + int32_t expertScanCounts = 0; + Scan(tempStorage).ExclusiveSum(divUpMulLog2(accExpertCount, params.mPaddingLog2), expertScanCounts); + __syncthreads(); + + if (isLocalExpert) + { + for (int cta = 0; cta < numCta; ++cta) + { + const int32_t localExpertIdx = (expert - params.mLocalExpertsStartIdx) >> params.mLocalExpertsStrideLog2; + params.mPtrCtaIdxXyToBatchIdx[ctaOffset + cta] = localExpertIdx; + params.mPtrCtaIdxXyToMnLimit[ctaOffset + cta] + = min(mulLog2(ctaOffset + cta + 1, params.mPaddingLog2), + mulLog2(ctaOffset, params.mPaddingLog2) + accExpertCount); + } + } + + // at this point, we can write out padded count + if (threadIdx.x == 0) + { + const int32_t permutedIdxSize = mulLog2(numNonExitingCtas, params.mPaddingLog2); + params.mPtrPermutedIdxSize[0] = permutedIdxSize; + params.mPtrNumNonExitingCtas[0] = numNonExitingCtas; + } + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) +#if !defined(PDL_PROFILE) || PDL_PROFILE == 0 + // we can trigger the next kernel at this point + if constexpr (KernelParams::UsePdl) + { + cudaTriggerProgrammaticLaunchCompletion(); + } +#endif +#endif + + for (int tokenIdx = 0; tokenIdx < params.mNumTokens; tokenIdx++) + { + int offset = tokenIdx * MaxNumExperts + threadIdx.x; + if (smemKIdx[offset] >= 0) + { + int const expandedIdx = tokenIdx * params.mTopK + smemKIdx[offset]; + int const offsetWithinExpert = static_cast(smemOffset[offset]); + int const offsetForExpert = expertScanCounts; + int const permutedIdx = isLocalExpert ? offsetForExpert + offsetWithinExpert : int32_t{-1}; + + params.mPtrExpandedIdxToPermutedIdx[expandedIdx] = permutedIdx; + if (isLocalExpert) + { + params.mPtrPermutedIdxToTokenIdx[permutedIdx] = tokenIdx; + } + } + } +} + template #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) __global__ void __cluster_dims__(NumBlocksPerCluster, 1, 1) __launch_bounds__(NumThreads) @@ -234,10 +386,11 @@ void run(Data const& data, void* stream) data.mNumExperts % 4 == 0, "Routing kernel expects #experts %d to be a multiple of 4.", data.mNumExperts); TLLM_CHECK_WITH_INFO(data.mPaddingLog2 < 8, "Routing kernel expects padding log2 < 8, got %d", data.mPaddingLog2); + bool const useSingleBlock = data.mNumTokens <= BlockKernelMaxNumTokens; bool const useSingleCluster = data.mNumTokens <= (data.mPtrScores != nullptr ? MaxNumTokensSingleClusterScores : MaxNumTokensSingleCluster); - if (!useSingleCluster) + if (!useSingleCluster && !useSingleBlock) { TLLM_CHECK_WITH_INFO( data.mPtrExpertIdx != nullptr, "When #tokens is large, `mPtrExpertIdx` is a required input."); @@ -245,7 +398,15 @@ void run(Data const& data, void* stream) data.mPtrExpertCounts != nullptr, "When #tokens is large, `mPtrExpertCounts` is a required input."); } - if (useSingleCluster) + if (useSingleBlock) + { + //@TODO: For now we use the single block kernel for cases with token number no larger than 4. + // We will future tune this threshold based on the performance. + LAUNCH_ROUTING_WITH_EXTRA_FLAG(data, false, routingIndicesBlockKernel, 1, NumThreadsSingleBlock, + /*smemSize=*/0, // No dynamic smem + stream, data.mDoSoftmaxBeforeTopK, /*forceFloatInput=*/false); + } + else if (useSingleCluster) { LAUNCH_ROUTING_WITH_EXTRA_FLAG(data, false, routingIndicesClusterKernel, NumBlocksPerCluster, NumThreads, /*smemSize=*/0, // No dynamic smem diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu index b8064c7c12..1d267a2955 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu @@ -202,39 +202,46 @@ namespace PermuteGemm1 { tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions getOptions( - btg::Dtype dtypeElt, int32_t tileTokensDim, bool useDeepSeekFp8) + btg::Dtype dtypeAct, btg::Dtype dtypeWeights, int32_t tileTokensDim, bool useDeepSeekFp8, ActType actType) { - tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions options = {.eltType = dtypeElt, - .outputType = dtypeElt, - .deepSeekFp8 = useDeepSeekFp8, - .fusedAct = !useDeepSeekFp8, - .routeAct = true, - .staticBatch = false, - .transposeMmaOutput = true, - .tileSize = tileTokensDim, - .epilogueTileM = useDeepSeekFp8 ? 64 : 128}; + tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions options + = {// Swap A and B dtypes because transposeMmaOutput is hardcoded to true + .dtypeA = dtypeWeights, + .dtypeB = dtypeAct, + .dtypeC = dtypeAct, + .actType = actType, + .deepSeekFp8 = useDeepSeekFp8, + .fusedAct = !useDeepSeekFp8, + .routeAct = true, + .staticBatch = false, + .transposeMmaOutput = true, + .tileSize = tileTokensDim, + .epilogueTileM = useDeepSeekFp8 ? 64 : 128}; return options; } -Runner::Runner(btg::Dtype dtypeElt, bool useDeepSeekFp8, int tileTokensDim) - : mDtypeElt(dtypeElt) +Runner::Runner(btg::Dtype dtypeAct, btg::Dtype dtypeWeights, bool useDeepSeekFp8, int tileTokensDim, ActType actType) + : mDtypeAct(dtypeAct) + , mDtypeWeights(dtypeWeights) , mTileTokensDim(tileTokensDim) - , mRunner(tensorrt_llm::kernels::TrtllmGenBatchedGemmRunner(getOptions(mDtypeElt, mTileTokensDim, useDeepSeekFp8))) + , mRunner(tensorrt_llm::kernels::TrtllmGenBatchedGemmRunner( + getOptions(mDtypeAct, mDtypeWeights, mTileTokensDim, useDeepSeekFp8, actType))) { } void Runner::run(void* hiddenState, void* hiddenStateScale, void* weights, void* weightsScale, void* expertWeights, - float* outputScalesScalar, float* outputScalesGateScalar, void* output, void* outputScale, int32_t topK, - int32_t hiddenSize, int32_t intermediateSize, int32_t numExperts, int32_t numTokens, int32_t* permutedIdxToTokenIdx, - int32_t* ptrNumNonExitingCtas, int32_t* ptrTotalNumPaddedTokens, int32_t* ptrCtaIdxXyToBatchIdx, - int32_t* ptrCtaIdxXyToMnLimit, void* bmm1Workspace, bool useRoutingScalesOnInput, int device, cudaStream_t stream, - int32_t configIndex) + float* outputScalesScalar, float* outputScalesGateScalar, float* ptrBias, float* ptrAlpha, float* ptrBeta, + float* ptrClampLimit, void* output, void* outputScale, int32_t topK, int32_t hiddenSize, int32_t intermediateSize, + int32_t numExperts, int32_t numTokens, int32_t* permutedIdxToTokenIdx, int32_t* ptrNumNonExitingCtas, + int32_t* ptrTotalNumPaddedTokens, int32_t* ptrCtaIdxXyToBatchIdx, int32_t* ptrCtaIdxXyToMnLimit, + void* bmm1Workspace, bool useRoutingScalesOnInput, int device, cudaStream_t stream, int32_t configIndex) { auto maxNumCtasInBatchDim = Routing::getMaxNumCtasInBatchDim(numTokens, topK, numExperts, mTileTokensDim); mRunner.run(numTokens, 2 * intermediateSize, hiddenSize, {}, numTokens, numExperts, maxNumCtasInBatchDim, hiddenState, hiddenStateScale, weights, weightsScale, expertWeights, /* perTokensSfB */ nullptr, - outputScalesScalar, outputScalesGateScalar, output, outputScale, permutedIdxToTokenIdx, ptrTotalNumPaddedTokens, - ptrCtaIdxXyToBatchIdx, ptrCtaIdxXyToMnLimit, ptrNumNonExitingCtas, bmm1Workspace, stream, device, configIndex); + outputScalesScalar, outputScalesGateScalar, ptrBias, ptrAlpha, ptrBeta, ptrClampLimit, output, outputScale, + permutedIdxToTokenIdx, ptrTotalNumPaddedTokens, ptrCtaIdxXyToBatchIdx, ptrCtaIdxXyToMnLimit, + ptrNumNonExitingCtas, bmm1Workspace, stream, device, configIndex); } size_t Runner::getWorkspaceSizeInBytes(int32_t topK, int32_t hiddenSize, int32_t intermediateSize, int32_t numExperts, @@ -274,31 +281,36 @@ std::vector Runner::getPassingConfigIndices() const namespace Gemm2 { tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions getOptions( - btg::Dtype dtypeElt, btg::Dtype dtypeOut, int32_t tileTokensDim, bool useDeepSeekFp8) + btg::Dtype dtypeAct, btg::Dtype dtypeWeights, btg::Dtype dtypeOut, int32_t tileTokensDim, bool useDeepSeekFp8) { - tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions options = {.eltType = dtypeElt, - .outputType = dtypeOut, - .deepSeekFp8 = useDeepSeekFp8, - .fusedAct = false, - .routeAct = false, - .staticBatch = false, - .transposeMmaOutput = true, - .tileSize = tileTokensDim, - .epilogueTileM = useDeepSeekFp8 ? 64 : 128}; + tensorrt_llm::kernels::TrtllmGenBatchedGemmRunnerOptions options + = {// Swap A and B dtypes because transposeMmaOutput is hardcoded to true + .dtypeA = dtypeWeights, + .dtypeB = dtypeAct, + .dtypeC = dtypeOut, + .deepSeekFp8 = useDeepSeekFp8, + .fusedAct = false, + .routeAct = false, + .staticBatch = false, + .transposeMmaOutput = true, + .tileSize = tileTokensDim, + .epilogueTileM = useDeepSeekFp8 ? 64 : 128}; return options; } -Runner::Runner(btg::Dtype dtypeElt, btg::Dtype outputDtype, bool useDeepSeekFp8, int tileTokensDim) - : mDtypeElt(dtypeElt) - , mOutputDtype(outputDtype) +Runner::Runner( + btg::Dtype dtypeAct, btg::Dtype dtypeWeights, btg::Dtype dtypeOut, bool useDeepSeekFp8, int tileTokensDim) + : mDtypeAct(dtypeAct) + , mDtypeWeights(dtypeWeights) + , mDtypeOut(dtypeOut) , mTileTokensDim(tileTokensDim) , mRunner(tensorrt_llm::kernels::TrtllmGenBatchedGemmRunner( - getOptions(mDtypeElt, mOutputDtype, mTileTokensDim, useDeepSeekFp8))) + getOptions(dtypeAct, dtypeWeights, dtypeOut, tileTokensDim, useDeepSeekFp8))) { } void Runner::run(void* permutedHiddenState, void* permutedHiddenStateScale, void* weights, void* weightsScale, - float* outputScalesScalar, void* output, void* outputScale, int32_t topK, int32_t hiddenSize, + float* outputScalesScalar, float* ptrBias, void* output, void* outputScale, int32_t topK, int32_t hiddenSize, int32_t intermediateSize, int32_t numExperts, int32_t numTokens, int32_t* ptrNumNonExitingCtas, int32_t* ptrTotalNumPaddedTokens, int32_t* ptrCtaIdxXyToBatchIdx, int32_t* ptrCtaIdxXyToMnLimit, void* bmm2Workspace, int device, cudaStream_t stream, int32_t configIndex) @@ -306,7 +318,8 @@ void Runner::run(void* permutedHiddenState, void* permutedHiddenStateScale, void auto maxNumCtasInBatchDim = Routing::getMaxNumCtasInBatchDim(numTokens, topK, numExperts, mTileTokensDim); mRunner.run(numTokens, hiddenSize, intermediateSize, {}, numTokens, numExperts, maxNumCtasInBatchDim, permutedHiddenState, permutedHiddenStateScale, weights, weightsScale, /* perTokensSfA */ nullptr, - /* perTokensSfB */ nullptr, outputScalesScalar, /* outputScalesGateScalar */ nullptr, output, outputScale, + /* perTokensSfB */ nullptr, outputScalesScalar, /* outputScalesGateScalar */ nullptr, ptrBias, + /* ptrAlpha */ nullptr, /* ptrBeta */ nullptr, /* clampLimit */ nullptr, output, outputScale, /* permutedIdxToTokenIdx */ nullptr, ptrTotalNumPaddedTokens, ptrCtaIdxXyToBatchIdx, ptrCtaIdxXyToMnLimit, ptrNumNonExitingCtas, bmm2Workspace, stream, device, configIndex); } @@ -348,11 +361,11 @@ std::vector Runner::getPassingConfigIndices() const namespace MoE { -Runner::Runner(btg::Dtype dtypeElt, bool useDeepSeekFp8, int32_t tileTokensDim) - : mPermuteGemm1(PermuteGemm1::Runner(dtypeElt, useDeepSeekFp8, tileTokensDim)) - , mGemm2(Gemm2::Runner(dtypeElt, btg::Dtype::Bfloat16, useDeepSeekFp8, tileTokensDim)) +Runner::Runner( + btg::Dtype dtypeAct, btg::Dtype dtypeWeights, bool useDeepSeekFp8, int32_t tileTokensDim, ActType actType) + : mPermuteGemm1(PermuteGemm1::Runner(dtypeAct, dtypeWeights, useDeepSeekFp8, tileTokensDim, actType)) + , mGemm2(Gemm2::Runner(dtypeAct, dtypeWeights, btg::Dtype::Bfloat16, useDeepSeekFp8, tileTokensDim)) { - auto const& gemm1PassingIndices = mPermuteGemm1.getPassingConfigIndices(); auto const& gemm2PassingIndices = mGemm2.getPassingConfigIndices(); @@ -370,6 +383,11 @@ Runner::Runner(btg::Dtype dtypeElt, bool useDeepSeekFp8, int32_t tileTokensDim) TLLM_CHECK_WITH_INFO(!mPassingConfigs.empty(), "No compatible configs found for the fp8 block scale MoE runner."); } +Runner::Runner(btg::Dtype dtypeElt, bool useDeepSeekFp8, int32_t tileTokensDim) + : Runner(dtypeElt, dtypeElt, useDeepSeekFp8, tileTokensDim) +{ +} + void Runner::setOpsData(MoERunnerArgs const& args, MoEWorkspace const& workspace, moe::dev::convertsf::Data& convertSfData, moe::dev::activation::Data& activationData, moe::dev::finalize::Data& finalizeData) @@ -421,7 +439,9 @@ void Runner::setOpsData(MoERunnerArgs const& args, MoEWorkspace const& workspace finalizeData.numTokens = args.num_tokens; finalizeData.numExperts = args.num_experts; finalizeData.topK = args.top_k; - finalizeData.hiddenDim = args.hidden_size; + // We want to fuse unpadding into the finalize kernel, so we need to use the output hidden size. + finalizeData.hiddenDim = args.hidden_size_output.value_or(args.hidden_size); + finalizeData.hiddenDimPadded = args.hidden_size; finalizeData.totalNumPaddedTokens = workspace.total_num_padded_tokens; } } @@ -489,11 +509,12 @@ void Runner::run( auto const& config = mPassingConfigs[configIndex]; mPermuteGemm1.run(args.hidden_states, hidden_states_scale_linear, args.gemm1_weights, args.gemm1_weights_scale, - workspace.expert_weights, args.output1_scales_scalar, args.output1_scales_gate_scalar, workspace.gemm1_output, - workspace.gemm1_output_scale, args.top_k, args.hidden_size, args.intermediate_size, args.local_num_experts, - args.num_tokens, workspace.permuted_idx_to_token_idx, workspace.num_non_exiting_ctas, - workspace.total_num_padded_tokens, workspace.cta_idx_xy_to_batch_idx, workspace.cta_idx_xy_to_mn_limit, - workspace.bmm1_workspace, args.mUseRoutingScalesOnInput, device, stream, config.gemm1Config); + workspace.expert_weights, args.output1_scales_scalar, args.output1_scales_gate_scalar, args.gemm1_bias, + args.gemm1_alpha, args.gemm1_beta, args.gemm1_clamp_limit, workspace.gemm1_output, workspace.gemm1_output_scale, + args.top_k, args.hidden_size, args.intermediate_size, args.local_num_experts, args.num_tokens, + workspace.permuted_idx_to_token_idx, workspace.num_non_exiting_ctas, workspace.total_num_padded_tokens, + workspace.cta_idx_xy_to_batch_idx, workspace.cta_idx_xy_to_mn_limit, workspace.bmm1_workspace, + args.mUseRoutingScalesOnInput, device, stream, config.gemm1Config); // We do not fuse activation with FC1 for DeepSeek FP8 due to the weights shuffling constraint. void* gemm2_input = workspace.gemm1_output; @@ -509,10 +530,10 @@ void Runner::run( // Run gemm2 mGemm2.run(gemm2_input, gemm2_input_scale, args.gemm2_weights, args.gemm2_weights_scale, args.output2_scales_scalar, - workspace.gemm2_output, workspace.gemm2_output_scale, args.top_k, args.hidden_size, args.intermediate_size, - args.local_num_experts, args.num_tokens, workspace.num_non_exiting_ctas, workspace.total_num_padded_tokens, - workspace.cta_idx_xy_to_batch_idx, workspace.cta_idx_xy_to_mn_limit, workspace.bmm2_workspace, device, stream, - config.gemm2Config); + args.gemm2_bias, workspace.gemm2_output, workspace.gemm2_output_scale, args.top_k, args.hidden_size, + args.intermediate_size, args.local_num_experts, args.num_tokens, workspace.num_non_exiting_ctas, + workspace.total_num_padded_tokens, workspace.cta_idx_xy_to_batch_idx, workspace.cta_idx_xy_to_mn_limit, + workspace.bmm2_workspace, device, stream, config.gemm2Config); // Run finalize if (args.do_finalize) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h index 08ce5a8916..a1c7113aa2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h @@ -120,7 +120,8 @@ namespace PermuteGemm1 class Runner { public: - explicit Runner(batchedGemm::trtllm::gen::Dtype dtypeElt, bool useDeepSeekFp8, int tileTokensDim); + explicit Runner(batchedGemm::trtllm::gen::Dtype dtypeAct, batchedGemm::trtllm::gen::Dtype dtypeWeights, + bool useDeepSeekFp8, int tileTokensDim, ActType actType); size_t getWorkspaceSizeInBytes(int32_t topK, int32_t hiddenSize, int32_t intermediateSize, int32_t numExperts, int32_t numTokens, int32_t configIndex) const; @@ -134,14 +135,16 @@ public: [[nodiscard]] std::vector getPassingConfigIndices() const; void run(void* hiddenState, void* hiddenStateScale, void* weight, void* weightScale, void* expertWeights, - float* outputScalesScalar, float* outputScalesGateScalar, void* output, void* outputScale, int32_t topK, - int32_t hiddenSize, int32_t intermediateSize, int32_t numExperts, int32_t numTokens, - int32_t* permutedIdxToTokenIdx, int32_t* ptrNumNonExitingCtas, int32_t* ptrTotalNumPaddedTokens, - int32_t* ptrCtaIdxXyToBatchIdx, int32_t* ptrCtaIdxXyToMnLimit, void* bmm1Workspace, - bool useRoutingScalesOnInput, int device, cudaStream_t stream, int32_t configIndex); + float* outputScalesScalar, float* outputScalesGateScalar, float* ptrBias, float* ptrSwiGluAlpha, + float* ptrSwiGluBeta, float* ptrClampLimit, void* output, void* outputScale, int32_t topK, int32_t hiddenSize, + int32_t intermediateSize, int32_t numExperts, int32_t numTokens, int32_t* permutedIdxToTokenIdx, + int32_t* ptrNumNonExitingCtas, int32_t* ptrTotalNumPaddedTokens, int32_t* ptrCtaIdxXyToBatchIdx, + int32_t* ptrCtaIdxXyToMnLimit, void* bmm1Workspace, bool useRoutingScalesOnInput, int device, + cudaStream_t stream, int32_t configIndex); private: - batchedGemm::trtllm::gen::Dtype mDtypeElt; + batchedGemm::trtllm::gen::Dtype mDtypeAct; + batchedGemm::trtllm::gen::Dtype mDtypeWeights; int32_t mTileTokensDim; tensorrt_llm::kernels::TrtllmGenBatchedGemmRunner mRunner; }; @@ -152,8 +155,8 @@ namespace Gemm2 class Runner { public: - explicit Runner(batchedGemm::trtllm::gen::Dtype dtypeElt, batchedGemm::trtllm::gen::Dtype outputDtype, - bool useDeepSeekFp8, int tileTokensDim); + explicit Runner(batchedGemm::trtllm::gen::Dtype dtypeAct, batchedGemm::trtllm::gen::Dtype dtypeWeights, + batchedGemm::trtllm::gen::Dtype outputDtype, bool useDeepSeekFp8, int tileTokensDim); size_t getWorkspaceSizeInBytes(int32_t topK, int32_t hiddenSize, int32_t intermediateSize, int32_t numExperts, int32_t numTokens, int32_t configIndex) const; @@ -167,14 +170,15 @@ public: [[nodiscard]] std::vector getPassingConfigIndices() const; void run(void* permutedHiddenState, void* permutedHiddenStateScale, void* weight, void* weightScale, - float* outputScalesScalar, void* output, void* outputScale, int32_t topK, int32_t hiddenSize, + float* outputScalesScalar, float* ptrBias, void* output, void* outputScale, int32_t topK, int32_t hiddenSize, int32_t intermediateSize, int32_t numExperts, int32_t numTokens, int32_t* ptrNumNonExitingCtas, int32_t* ptrTotalNumPaddedTokens, int32_t* ptrCtaIdxXyToBatchIdx, int32_t* ptrCtaIdxXyToMnLimit, void* bmm2Workspace, int device, cudaStream_t stream, int32_t configIndex); private: - batchedGemm::trtllm::gen::Dtype mDtypeElt; - batchedGemm::trtllm::gen::Dtype mOutputDtype; + batchedGemm::trtllm::gen::Dtype mDtypeAct; + batchedGemm::trtllm::gen::Dtype mDtypeWeights; + batchedGemm::trtllm::gen::Dtype mDtypeOut; int32_t mTileTokensDim; tensorrt_llm::kernels::TrtllmGenBatchedGemmRunner mRunner; }; @@ -200,9 +204,19 @@ struct MoERunnerArgs void* gemm2_weights = nullptr; void* gemm2_weights_scale = nullptr; + float* gemm1_bias = nullptr; + float* gemm1_alpha = nullptr; + float* gemm1_beta = nullptr; + float* gemm1_clamp_limit = nullptr; + float* gemm2_bias = nullptr; + int32_t num_tokens{0}; int32_t num_experts{0}; + // Hidden dimension input of MoE block. It might be padded. int32_t hidden_size{0}; + // Hidden dimension output of MoE block. It is not padded. + // If not provided it is the same as hidden_size. + std::optional hidden_size_output; // TODO: only compiled routing kernel supports top_k = 8 int32_t top_k{0}; int32_t n_group{0}; @@ -290,6 +304,8 @@ class Runner { public: // FIXME: tileTokensDim is hardcoded for now + Runner(batchedGemm::trtllm::gen::Dtype dtypeAct, batchedGemm::trtllm::gen::Dtype dtypeWeights, bool useDeepSeekFp8, + int tileTokensDim = 8, ActType actType = ActType::SwiGlu); Runner(batchedGemm::trtllm::gen::Dtype dtypeElt, bool useDeepSeekFp8, int tileTokensDim = 8); void run( diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/CMakeLists.txt b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/CMakeLists.txt index 7e69505479..7e1ac7d13a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/CMakeLists.txt +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/CMakeLists.txt @@ -18,7 +18,7 @@ file(GLOB_RECURSE SRC_CPP *.cpp) file(GLOB_RECURSE SRC_CU *.cu) -# filter_cuda_archs("100" SRC_CPP) +filter_cuda_archs("100" SRC_CPP) add_library(trtllm_gen_fmha OBJECT ${SRC_CPP} ${SRC_CU}) set_property(TARGET trtllm_gen_fmha PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e409cddc74 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27b1ac149eb002d3a3aaa06193160f667ef7446c6119ea8a7624cf5010e6fa91 +size 716859 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..15415ef2f2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa580ccd54b55d86aab06d842953dfa2de1adcd0f08d01fa1d856e6500b42c55 +size 684095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..666f3afb91 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b4ac41b6ce52463180d04673e91fac15280ef0cbe3b470307cf01603867072d +size 698705 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a52fa68f39 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08ed2a80e783dfff875ac4d5dd3d5544707a16bb70af7e290e883741e30a7c97 +size 668307 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a0167a60ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46ff8dd8bf7eeae98428cbab52b0beb9acceae29d9fe76c3effdb32ad423a500 +size 709439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3acb3dd020 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc6e98d50fb43fa50924a8dbfbc0087e8d80e93587e28ed3b0c704ea02012be1 +size 657833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ef065f45ec --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a75e31f4b04916330fd2b7ed4886c6bdb8fa49852385175d312cdf291ee852c +size 709703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1051004055 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:779ee33a31a4881f732fff48ebbe76b2a26e99d41dbc860a70a4cfb0d2ba6d57 +size 665203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..97b454ff03 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e6861130024df851504602e4b6482d37674236d80c9316eb855381e0e0c1797 +size 762245 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c2f616ca1b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e28137d016387ffa13f6c403059a81c733d9574aa5db4aa845129ac7005dedc +size 682661 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cfaeed70ea --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d52b82a2edd9ae0b2acb1f4bc1349978e22c4ed1107ee045a592b44c9c6a72b +size 682309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3b4bef8a6b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97eb744d02a4243f1397551924a01380de387941119bc524400ef58135ef29d4 +size 609436 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8a6998b5b4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07b1297d91824fa2725a354d9df206300df49dfb3a729a026ff651a83e55867c +size 647281 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..48d8ab0256 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f03e13af5a531d3b85b26f21ce9f0370e04571b3d24a7881bb1c5b621f157bc2 +size 574406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d9e076780f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28b66bd75aeaebeb1ddb9be6166369686e6313ac9867ea2ac83ed390d196f421 +size 691283 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..373a81c398 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4db97c86ad2a6b669e4ad3ccae0ad9baf2ec1817cf9d0baed71f53876794fd89 +size 640469 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2e7f23be24 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:725cb76d009776fc6f24976f56ae7be7ab732e5f0147a6dd6deaa95ffa4fdf52 +size 692339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..10d4453f4f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc144f2d52392e50e82335205690bc054203bb9e1ad478e5ef8548c715b124c +size 648627 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d79e103882 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:157a37c3925e884b2f3500709155eab584d7004e275df98384dc5a2f69a7d9d7 +size 743301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c5ac566f20 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e008ee2db4889f7008c284f4d70d1e226248833191219995509afc427ff694f3 +size 666875 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4bf25d047f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d43fc63094483328c0dfc2942e974bf1f0a6b7afde62b6ebfc874677fe6c36e +size 660997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0d83c4ef35 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e139fd02d8968c03a353a1193779d80812b2b5affbbdc66f99ca10545ac937f1 +size 592070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5e54dd61b2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:840bea09bf19531420d397055139a8b2612a528d1120f4e8ddc4fd5d184e6d31 +size 625969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4cc20e2dfc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dabd42252c4bd6950b5fc8cf82731d83d012d33b30ac863f8047f3d4325f68e1 +size 556252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..282319bf2c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9ab714ab53a270d238a770dffbbd88566afb1e30f06ceebb598c7b9a9fde446 +size 716181 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4449f9ae65 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7962446880b34073e4719007aa7384dd68461bf0a85625e1a96737ee0712babe +size 662209 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a197dcb3ea --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1807c105d02bc2d5951aa3be505b0a0d727f02ddbb3a4747fd1816b37aaddf6 +size 716447 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2e86749707 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65c8609b79ee4e130d01129cef89c2ef029b33ba223b58e5ba4cd5e172472989 +size 669579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..23d2dc7f59 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ff1ea6d051b3a1e333b91276a1f07a42b13d682c7de06841f1d5e47f4321f4e +size 814177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..8186f63430 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e46798ba011c02938dbd922153aa413723985a35ef05b7d69420da91f57451d2 +size 754969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e63f656cfc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2899d9173ecc0af9b1b653bf3c90afb86e92f5fd6c5a13d2970f309a6ea6a4d7 +size 671935 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a2f92c051d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2432e8c8bb8857a607077aa2f1317a37555c2d99724bad8804324b43d028392f +size 605030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..800ab33c13 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2438d61847c4d144b756ff2a0d280a3920b06fbf45dfdcbecad19bba97db58c +size 632959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9804388720 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cc0c71192acc4cf1e3238d6af12c4700bb7abad9cab74f608ee45d0fc5d4f3e +size 568422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6317d452d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0a8bb2517274b16b60cbbf1c55e292a3a2d6c2828696b74c270dfb8c4c4c255 +size 698817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..97d161294a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd09e986f515c4d837e763ecb48998c724bd406ddac83a9d758e92e575d55290 +size 644055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..176a56291c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08eb909f7d672518a864cb72f7f7b63ad10a64b9f3fe38d5fb3da6e940539327 +size 699081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f32b7b4a23 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aff75f98a82c708ece7f646632fdc1890a42fd13ba93c09fd625ab96dd2db0a +size 653003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e1019b0e43 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b736f2e778a3675982ca2bf1480f8365b590907b9ac29eb8138b024625c6a43f +size 796811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..8ca47831ba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f9643acf43d8dd381d09e0295f2751ca66e97d478a8162f5eae0f9ad4461f9e +size 739183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..83be310731 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be24a7a704c1a00e37a4ed18afe59555eb6796f5f10fca9103e39dc3eb853e3a +size 650623 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3c258919b3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:564a65acc48214153b4a464431d92cff8e45c4106e417ea20780b878300032ca +size 587664 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..36b9eba5dd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cf683762ae90059c5bb48806e9e058292a45144fb1fb2c538c64b23a663cd19 +size 611646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp similarity index 81% rename from cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp rename to cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 74e13ef7a5..65ff84ca75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c37fc2c54f52f879c473a5461df4151bbde361fe24b9a70a558b97532beb7ba -size 304112 +oid sha256:383f10436a3ab958e2d874d57a7b8ea73c05e761796359cca3b6d02ab0efa24e +size 551846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..593401232d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63eb6a57b257e669aa572f3469a4e86d2d7e3ef00adcc7fba59954a0fdd0ccb3 +size 646483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..cda562c677 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b96345c64c93466fb4aac6cc284b77a6cfbc515650610f480640022f39ab0e95 +size 613520 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3a431c0c3d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f42f1167bb8153983d8b444687753493830074f83dc047503894fb09bed7c62 +size 629119 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f4f76e0297 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a900379da740668badc53b4071cde246c4ee6a7349fd583d322f1b1aafa4b613 +size 597734 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2f1dac79d5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3ebbd4568716506800388ce20cb04631ca97ab03c4cabee1df05b24e1402015 +size 606946 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a7aabf14b5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8082d0181f9897682db6c1867eab13984d8d47720d51ecd7bb519478c8fa3a70 +size 580624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..23711cad4a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b890a2d503020e5995bb55a024603dc5748003715974489f258e93c4d7b37f19 +size 628597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5b72f2ef70 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5353e765dcf15f5c77af6c951b2c16b32f07675cbef2dbe2d1a03ba9378f221b +size 598328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8e85e6085b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0eb157a336a566843a5d7b67b219b5078287a27d6ff3f0183f5da42ffaa8711 +size 691869 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..9d75b4667f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ad74ca919d8b6c06008f2c1662c6e7e28d29d69ca8e75d0a1a8905ffc0a44f7 +size 611274 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0cf5df49a8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:384f342f4fdfaacfcecc74fa58c1a1e5a041021bb4d1dcf40087d170f46fbde7 +size 619359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6bd65b2096 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb1311f939b585c94554658af236f2d7ee9d3babc8091f4cd0bcbcc56318860f +size 531438 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..442eaab243 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:274395bbefe58c2558d4534c57e5755e55c07f2810e25bab11d67d667cdda45b +size 555838 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..eef30e3755 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77410c0853a9d6cbc0b2999d18e939a9407d2cb097f7794b49b38bdffcdfbe4f +size 497988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..47485afa5b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fff0a3cb5e85385fdb385e010d4922e8d816277c8b826449c41e3d0c0e638d1 +size 590370 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5c71729c5a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8d79883401a5ab3435c3309c19e81f9cfc6488acfa33be6062c1033d7b801d4 +size 564048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8f9c7f72ec --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66383812d7c619b960df5efbc541038e8730c0f68b0bd3ca64e5396b01d75d56 +size 611232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0f6f0fae1d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908783e8a4965f17c34f4ab7e5fe2a8c1fa3e896e23078eecd68f8ca9d279348 +size 580174 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8d07ea80a7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7f7c603148a22daf6555e80b617a5e0c0208f8e1bc71b6d3bff52ccdd63791b +size 672925 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2baeab7519 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40ad18b736cd2614fd7cc28e43e95a58b0a3ca11222a90ec5edbd0b5bc96587f +size 595486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0a45fe12b8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1147d3101600078520cc22f6a9d94e94b3dcd5a22bafc33c882a19f68fe98496 +size 598046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2273b0f600 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:452e9f854020241c8d87e03198169dc3b496b459126b7e203c2c05cc965e2fba +size 514862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4f9c4688b8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfec417a20df3aea65b491d316000d5c4a4421f762b1aa378beaed84ae09017a +size 538474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4282a1ff1d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f17a66d53771cc584d83f08d81593b3a78a10d99db91a0d677f446b6f41873c +size 482200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..290d0781ba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d38d19571c30ee9b7886c958f1a36e328fc146c6d8534b3ac9967fbdf0b43cb3 +size 627183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..987b620487 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:490f49e40e6dff2087951f51cf7acb376f0a534289450078eddc768892771834 +size 594520 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0a48f1e4fc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec407d0f941032e2863f42af54259e7a47a0912dfab15f7953e28bfc7452096a +size 636131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..baebbc9c06 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a88295931adda37a69245bb95486f1c8c2e8b9028de12ad8ceb55217ef8eca6 +size 602704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1e27f1da63 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:865ecb0d7f54c04b37acdb228c2b4d0c1e1a6f6324aebfdb9f5a64b5d53c8a15 +size 743801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7a71abe78c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8773f2f05e51a291db0e4752208f54c80b59ea19e0f270c535fab1b14c64eff9 +size 680055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e7add76342 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6da25bc788b8f790191adf7efda7f25235777b571d15bb5413d409dfd3c75379 +size 607404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b7f7327e46 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1afa3135bef82cb681ea28ac42dcd4afd37dac9f9e7d2fa9704a69aa37edd9c4 +size 537342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a3de98c935 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a8530b23a366a211ebd573f7b86a3ef4a7abf1ece4c7fbabaa562beb70497e0 +size 567638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4fdcb06810 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:055b6501be55680c036698d808989cf124a2ea32752cba221c194b50ce3a3c1c +size 500734 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8a95d2072e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ed7ac04a950aa6a44979782ebf3db4943bd76e821cf3a5e40402b32d16f35a7 +size 609816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..341636009b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ea0e17ca4f22531f985c8e39d32e3c13b796016aad4da88803dda4e16955e64 +size 577156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..079502053e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adc910d77c39dd221e1d3a13db23950289d2529cad367c00b1d4936f29e6f3b2 +size 618765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..21bc7b4f35 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de788b00d14153d54a35d040be97d5fb186761380d7a4ff27a27615db65828ee +size 585338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..41874d1cbf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6953f3004f1eb54baecd7450655ba09672d0c840822d6917a7befd244038ad5 +size 726435 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4f7da2d278 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:845a68e0343007ea9e560678e13ceb02428cd2ca1e9aedc74949286531d41fdf +size 664267 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1e48eb5999 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7673bd52f9cb01b7db15f8c35d3b12d6308e1a09a02b6cd4a963df9ee80e5a90 +size 586092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6f3c7c7bed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1deaff2efa1eaf5a937449e21af57895657570ee60b65829de775c5d319cb119 +size 519188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ef072696c6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae6f5e4f93661afe4c60ec67d3374c786af666dae94eaaf1acf2b2b9434c5535 +size 546326 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..89647acfc8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9439b50fabc7333ae49fa0a18a704af7209672520e90375dde7da94d1572be5f +size 482580 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index a2c5a1f10e..59041bb861 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:008fff06e1c656ddd2fd9e31d2d6fd3a27bb0a60fc98286914cb545c4b873241 -size 1325861 +oid sha256:3b357a6f4c9901fbac1c16a9b328aebc7bd22d36728217adf327d06f636ec3a4 +size 973719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 3b5840c18f..4e67ebde7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8a0e87c9ac534472a73eb22170159bbf53108a1bc5b6e6bae4200460e4ef62f -size 1194873 +oid sha256:04c9edadcdedfc2670ee824caae9bf57c9c95a8dd0584fdf439a954482874197 +size 877413 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 59a26a0ed9..d499fdda6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65ad7a0e3272713469a244a36cf27d8158d20ca1d4193ab44d41c2b8ada29a76 -size 1321665 +oid sha256:38442820fae09138f1f7a6868a63076c26075abc113edef04ffadebfbd5857d4 +size 973865 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index ee05a3bbfb..fe3e652b4c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d5de9e0e756c983ec789f72a3acd1e09898c3d589ff93273257b60bda61c7af -size 1228763 +oid sha256:4f681f620d0f0f9bedc8e0bc7783fe061073235f1c423bbda1ba216e4ae8d827 +size 885353 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 90f02c36e9..e913322d3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07ab803a1c52dcf6565cb0fbcac6905dc05f3d3fcead47ba9b5299df8ded9bb1 -size 1606895 +oid sha256:5e710014ef86cb1d61d16e3c0ba58e4e287f4d8c23a82d48ce9f8a035ec38517 +size 1099551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 339a35728e..2266686c79 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2e0290c42041816bca27cb316195d94f2ba87e890aa714aea5c524c16754a96 -size 1476253 +oid sha256:58e7f0ba908cfb60fee905d761c3d15770281f0e63af7a5059d39c1380f7551f +size 1012963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3fcbe6be05..efd69c0383 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fad5e6b2fe34e5d4fa85bcbdf7e6de8f48126447861699ecdfeef2aef80c5e00 -size 1442339 +oid sha256:b9b134aec7f60ee1bedf4c60f673f1a5118928feae4159de2c1f1cd83c463a3f +size 1025621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c38c666d33..09b4aa56b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d88f5743c6c63f37cececb6f9d126304435d7cd7760db0891ef68a5e8e532350 -size 1264681 +oid sha256:03d0590aa511ed6b6c33c36cad54180eb2cebd4af66fee03731e11f1f85f7dbb +size 983729 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 85fe099fac..1d58de6484 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60ad77a2d6656be8a43133e3880371a917ab644cdc490c8af001d38b06db1246 -size 1428033 +oid sha256:60fc1054362b865910bb53074d142bb1307a4bae9bffd74b8bd39072e9dc253e +size 1006133 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 96535fff61..1ab2b0c1ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06d469c816ea295a68f2904ba8473460e7f6ce714306cc308a57b7e55c1a073d -size 1251165 +oid sha256:cfe5f9a517ba867e0ea930cbab710e53c26ffbea8938e9eb6d60f2b55a7cfb32 +size 963353 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 08f5a6fef9..28d443b9ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:008b02be6b4741ab8a10ea26c8f92ba468e974244605572030ef4be078a603ee -size 2076501 +oid sha256:0ea2a288f05ef2ab28f9d38d4c1f9b0dd064823cb97ff66a21dbace1a49eab3a +size 1165117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 356ebd3714..c8fbedeb30 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95962f12665056d1fb0b17669cea4aab4fc556f6663ed4d8a4123a7191afce7a -size 2064311 +oid sha256:68da30fa1a1de6c2c2da792f6d2777fac3982c86c35d2958d73b56d2f4196ba6 +size 1147499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index ff2f1d83a9..a7389a7b57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:502f7e4895bad5c1912dfaa60701f7bb9ea43fc654c983f7e1022b8599cfdad1 -size 1346497 +oid sha256:43e974f2f5f1a4f27ad439c5937c674e278c30e9f76d4cf45c93306ec900f2f4 +size 1031749 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9c165e687c..e26b18b395 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e7aee1206ed3abcda6fc86dfcfbe5341bf694af6024b0a8775297c4c23f136f -size 1244269 +oid sha256:1d337ef12bc3dad76fd67312c4da4e3263e6e2145fe9ac380384a65890a5e6b5 +size 912701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 0ae2e30c2d..9c64d1cadf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd07ba4147709c1bfe19a3fabb0a03be4c6b730f9f20613d4ce3c9da06fcbce0 -size 2062985 +oid sha256:e795d01323db33d69df1adffbb83eca545592b537e66446842778b7bd97168d9 +size 1143953 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 5b35f0b02d..03659eae45 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dacd20682e4cf7db6c3c2ca4cf9016ce8d37163c54a27450654301969a01882e -size 2050053 +oid sha256:9e32d87f3e11e4444f9424dcc3cec492940fb0dce50e4a7c262994f93cb722ce +size 1126335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 7129a687c7..94b4b1b3c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79390a79bbeee0b4b79aafda6fae7cde4f68e800245bfe4f14a440b9c462a40e -size 1331993 +oid sha256:b0c5edbc14b8780fba08abcf06c2bed306b1126f89d9dbf2128da8b756578ec0 +size 1010289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index bdfc28843c..24efdfb2ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c67b2bcddad7e2e7ab2ce3b64cc637f8941cbb459de604b4e3d4f2350c4c66af -size 1230013 +oid sha256:8a0b4e2abba0ed46dc126eb23a7b687a9ccd27cea62300c39f487426f372af84 +size 891437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f24dc8b3ce..eda9b4c377 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f2abe1f61bb220967f743891b605b1e3eb8c4afbcdfc05f93c6d0d0fc3fd5b7 -size 822247 +oid sha256:9708f936ca45a4b9f18b1baf17c194cefed9f5d2141d29168118903bb3a0b181 +size 835173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 97f7dc61d8..dc304d3702 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd7b1d65d5a9b38aa7a980fe9977ff9af2a8ee5069e12e2cd0c3f23019d435ca -size 742275 +oid sha256:fbd250ff1dd05cd3f1afcc5001204b5eec9ab00c6aa1aa65c0ca4b322233c80c +size 755299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d83597a3ec..190306eb86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc175af5423ca4f13838682880c94cad46fabf97862b45acdb2d40e4960b72d8 -size 822339 +oid sha256:db6561f83cfdf8a2dd32d9d9f2b02547ef37e827d4b58d7802b0ce2e7892e8fc +size 834673 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e687a831e0..139fe19548 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31778eeae8343a7d7b11498b69c99067723ea8144804ad763bdfefd2b436c415 -size 761657 +oid sha256:f4f72ed44613fc8191041b4069bb6a87539b5b502fc0010204157aeb01e40b23 +size 776557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 22605ac8b0..a73b59c385 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72f2d3d58b044817124ecd0a97ae7ce66aa274283a697146ecc3fb9570dec493 -size 1507999 +oid sha256:6d0f8c0e7e7fd2820767b7225c73728788581603ef13324cb03be35337f7abe8 +size 1100901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3ff240e076..7e62d713b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83e2f95b09c1fab3cdf9d18fe5c939ab0cb58e2c543ff923db532f9974dc7aea -size 1296105 +oid sha256:d07a71f13650ae6f65a8bb64fac2cd165ee8f14fb844d051040d7ddd4f0a52f6 +size 982987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4b47d8eb23..2bfe9b84a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7fe3e90b151f92503ae974ba8830a5669958012557765ad89fcf55c6a33a7eed -size 921485 +oid sha256:f27d708aab39938556932c222ccec533a4c82c3edddc8b3eab33b4604adc65d3 +size 846499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bb637bb743..9389320afb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0e07e44286c0e7ce195323d434e9be6093574733fa0c79978f6cb5ec5da3642 -size 706977 +oid sha256:aa436bef48e920e3157a0b77be0af0423de54a12cdb3652547f5769ff022b2a1 +size 717681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1c24c29c4b..6e99dba65b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d03065c2f8859bf0ea937a847d2762b11f02cf76309910662f670e9a6d818b68 -size 848223 +oid sha256:23316ff7a96ca692bdf9b485963e10a190056de36c3b81d151872a631ef61835 +size 763617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a8051cc3f7..391fdd3541 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b5d50b30c920ac8af2396a9dfbc3d0e69bb5d4db7a6da73772bc329cb789e65 -size 655815 +oid sha256:c0a78bd1e82ee0e77cf8a3f59625b8919c2e5fae972d3caaa8407249eb215655 +size 666915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c0c029c259..e51a1a3359 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c21e4f51dcfe442944471342874f0d362ad28b39efe306af16f887b0abf8351 -size 808729 +oid sha256:4a504be8c2e522f884363b827c86d12ced39d8c3c480cb77c15918fe00d7a62a +size 812923 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1fc136f0d8..8be6b779e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a8e037902540ee9975e84aa3d9c590257e970bdbaeb873bcd848e8b76194bfc -size 728759 +oid sha256:3f9d81332b9e2c6c5e5381efc8f5a699b7e30f0adb7c69113925f3c0b6b9808d +size 734185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 10d1ec1dd4..54c280698d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee80ebf1b28495d19efb97e23fd46deb15c6cccf92491dd74351052ca091e449 -size 808033 +oid sha256:b90d04cd9fd0819de0b7a41daa29587e4894b2f46925c4094b7776e652414360 +size 811683 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 07e46304d0..9dbd4a63a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2cd32fd583da25b3ab35484ef855ecac3aaa9fb6a07ec0c84f0e5b5d2fef1666 -size 748141 +oid sha256:99fdd098f2869be7bc238c61141ecb8ee558547cfef4f7f8a20e92d00c916276 +size 754307 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 008f89e13e..52e3206290 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:782d9887e66ea28188c70a9792b55842f2b620a9031ebe2c0b74187d82ce9bc0 -size 1493841 +oid sha256:0c0e4bec935e771b8ac72215872dd17a48b183f80efd4e4d431a9f062d9a1cbd +size 1074803 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index b4a029659b..57bdeaccc0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cca076110efd588ab511244d79bd8cb01bb6748c5ab2ac1af13bcfa322d22fe3 -size 1282637 +oid sha256:c5da2c86cdf436848c5cc6050b1ca9e62a8d3051392465af89a97348e0379a42 +size 963647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5f51e1be23..dac7eea3da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38f6815d21cc388585a86bf0fee05b126546e076f69c3007aa3658c3cb858853 -size 907869 +oid sha256:988d2d2af5b8aa671c26d47e950997106fb3a21ff67fbe6b8906e0625baf0b98 +size 818133 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0fa0b52fe0..6f0719c5a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ca6736f6bff95232b1e8661aa93615a9ce287d4804a85b7c5d4a9b5e2f90928 -size 693459 +oid sha256:e2e5b2ce4acdbbed6c12dc93fbe97cec0522088a0ac97bb842a283a1fb7bce16 +size 696469 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 60d9bb4f14..931856172f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:62712614b14a1bc83d2c56b04b0abebcdfd14a91e1a53db15cb521c733e43909 -size 833967 +oid sha256:07367f29532ebc17048079a9be5dbf5631ac34305bb174e87210e151df6d9e01 +size 744081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2805d5c422..56511e02d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b6407cdd38a671e74795dab7064839f490640a1f8285bfffb53fab87a155b3da -size 641559 +oid sha256:e7c0269873f5f85a43da79c5a973cdbacedb27f5c5fcdd9596a757e97e8fe318 +size 646491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 765d01e427..11035232b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1335f151d77fac7e9dab24dec20935967539eef4873f8a4e51f67d930d1fc2ae -size 853855 +oid sha256:ab22e5b37a7287aad5d66cde581a0a9ad580653f7510ccf34e446b9f1b8a946f +size 844531 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 75f7273164..f50bbaf7fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ca265c2eecd9543d707c0d5604607c98c62f1a5963ea4591d40de7b6bbd9e18 -size 769937 +oid sha256:01b9b6d665ece956dd2db55aaa5491336004473c6d39db7453a36c64c93eec53 +size 777041 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b2bbfeee88..ae5e019c81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bc4c5199e7aef8272f8502efa8349f1dd913ada716e9f4adc22e87c6594395b -size 853157 +oid sha256:7a212bf5dc9d5ccb4b2fa01f0e5c55944915017af42b15086cf6ee8a03488445 +size 844031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9e812ca541..6930bfdb51 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:775f0a93a2baff01f97792add4914927bf3be06abd592e9b86ffe6de49fa15fe -size 791735 +oid sha256:61ec00b0997bb36d8b70e3e285bdeb4507032e13d692b6aba1aa0e9a53b29586 +size 782807 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c9d9930304..7bdfe94caa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68c0939ec6713025f17c2e7636c63f9c1e3b8433c84015042fbac8f85e53f538 -size 1724163 +oid sha256:6ea5e1ffddbe1bdb8a43b350322e516750f2185d5738d2d82f7f9f255b4481f8 +size 1155497 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index cc03c72a0e..524d9e3ba9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a03c503b7735d86aa2e051267f42fa0334d69365e140ac85ea9a07f3b6fd212 -size 1545569 +oid sha256:901e6850358d90972358c023335ac30905111b394a29538e95776d23e9285546 +size 1113211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index ca0534e4f0..86b936f5ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd90c55773ffe8359f31c0961cb3bac6fdd9e0647b00ed79ecfbc5bcc71f9405 -size 947075 +oid sha256:42c33ed95c4d7b2baf59344badbd3f56cfe9a27912eac33d2d14a28d1d7cf08d +size 824185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 94918745f4..7242e08b9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba9ea159488844f5114d640016a36b8355331f5b2d25977663d6bfce4c0fb6ad -size 728865 +oid sha256:9ee7bd77b91591b0798031ad16a3ec7661d254d1890fd118ce282e6adaeee2f1 +size 720923 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index cd0e90df7c..bf74ebdb68 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57e04bae9efe1ef0d5fcc8a215ab4b6d6427068a8cb47084e2a6c979549f2a79 -size 873417 +oid sha256:3bee76ed77eb7cb9441bda579a7031031848b6a276988c147ab181b18d6dc12a +size 764983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b62e1f0a81..32f87c8486 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:868cd0114bd0717d51b0d12e896c45b2d6e3142afb1fd464f7cb7a7fe2d9a056 -size 675435 +oid sha256:ce8fae585033e5c7b0b1caf3a08c8497603a35eb1796f0dec78fa0799a93f15a +size 667641 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1c1d9beafb..ee086efd55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e87507c13a6dc0737e1b77a10c980eb2ce7502c4b974d4e0f37566774a8106e9 -size 840337 +oid sha256:e57a41ad79975ecd85c3796666a026e408bfd6427a8936e5c53f8f2562331868 +size 822281 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 295590d2ad..7924922a1b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1857bb1b91f1ecd59517cc91f5657ff556ada1330efcf3e7c42dd1a99a47ab7e -size 756419 +oid sha256:a0208b7b40d2a6883d824c52859ab0c545a629d84326d61a466337a2f74c01d6 +size 754001 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4c64fd1a06..bcec2e02fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6284e759708403e0904bd9f2b2970fb33147a6554d73df70e3c5bbfabe3f6bad -size 839641 +oid sha256:74844a54b1e8f2e71200725fb2c13302390cb517ef5711db72ba36ae9b606d23 +size 820993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4f5b1e66f6..b62813dd95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8dc7da8b845adf091ea684ffaf5cb185e9857329a91e1d005fb6f1bd9b2a3b0c -size 778267 +oid sha256:72606e42ba00544df36952cf96fef3b01ecdf7b94d4cb811b3d0d39df78502e6 +size 759817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d52eb52e87..e9422c31db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b1dc303b350fbb4eb8972b53a8dabd5a5231a5bae589a68186d373a1c79c352 -size 1709857 +oid sha256:aaf02db0c8705e7f764ff29b6c7a68bcb55dbc7ca84ea52658bbb6b8e0f8b43a +size 1135221 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index cfd6328c66..271907e5a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf13ab8c97c37e75c7e7661c8967c00a1f0d9dbf6ee40378047af47870c62967 -size 1532051 +oid sha256:a7bbba54c5b957b16a05c946fc43992062ddd8fc50e53c2779bf0111f3ffc449 +size 1092885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index be16ec0d79..7f42161624 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b970d89a168ca10513810f2fadd07a6c3eb1c4c597be8fe61e72ce31e290d908 -size 933459 +oid sha256:1dcbcd3e292c3e4b3a57c6bf8b5d337c5b72b6023c29fca9fe0958af1909e63c +size 798187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c8c53cc4fe..744ab464dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79888c4c47a8d87159cabb703ad3ef5f8003e53c38a774fc53b307f0726e8a9f -size 714657 +oid sha256:f234ea895feba907e833e9dd56d6a69dc5625cbd347c85a305d618239139d17e +size 697687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 132f715d53..c40ad877b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a625a81b73adc8b0cfcc29a60964a08d803f9e7c4c8a352a371f3404c7565c09 -size 859111 +oid sha256:f77b1e5ca74926e1d2c9623be66aa4362c246978da8c6ac8cbd87ebdcdca4567 +size 738985 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2bc31eb17b..552ab50c53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c437d50cfca86c24d2b62d3c1e43baf3939578b400137f93e490bb717f3f86ef -size 661129 +oid sha256:d7ecacf69c8b1653d553c2a7c267fb95be3ddd511253fcf2fa909c0c5410697e +size 644405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index d2781f9811..e7b60a87fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c554d5e44672f450410e3a74586c8709ac2b297e9ebb8fa57d52b6913b899d3d -size 915901 +oid sha256:51afe8610ddc6adf9e9b10200f5266c127a51cd8a02c61f1cf832629ea5e57ed +size 736475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index b78319530d..6a06c21088 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b0c045ddefac54dbb56b87055d4a2372946d561640b8a438f418c1114ef4ca6 -size 811799 +oid sha256:0d5af274d6fb820fd430f0519b25d6b5b3a573f61876331e8d22ac25d6aee4a4 +size 639133 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 31c5dd9958..52dc2ae5e8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7a0c318ed1b2515c1b74bad2b1b0147792bed1fe96ee4efd0d9b224196a78a2 -size 916935 +oid sha256:e60b2a56b627bd21074b6ad092052afcd5308fc73a820c4c5fefcf153c1df59f +size 734353 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index fa1d0c36b3..06a888b757 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f551f3661d293bac12726605d80ca567f0e83bdcd0c4ed13f8c3f01f6078b34 -size 865373 +oid sha256:1fbe3b7d9499cbc072d1a594f4c6c653c65a74c802e86e74536bdaf3bb16334f +size 656545 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 8efc4e3597..10d762f5e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cafa72dab961d2435aeaed9d0eafed029d800d2580d9418d0e2c4e9ca3397da2 -size 1060677 +oid sha256:86c3607b305973a606b09c16377991116869b9cbbd9c72a7bfc5dfea8e6c9c3f +size 804687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 1902bf5a5d..1b57624cc3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:051954968fb8a232383ce80278d5164f5f6ec1d40d306d55977e8b2aa4461314 -size 956527 +oid sha256:73cda19669b45d902691305df7d63e2468ae5c52c7cc281af409a9597415ad45 +size 706801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f3b50a3034..200f70bae2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4bd2819f128073ca94a15b10d8d5e395407c4599464f00d25e2a3c3068ba946 -size 1023647 +oid sha256:a9e7c74fb27f805a57065be55d7c8f135378fffd287817a884290cf42fc6fc2b +size 848217 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3f08842f20..e5fb34a92f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a12393e3b8f9a6c974048085f6c27b4f39b6abc97f5099bc9c2fd4e0c8eba14 -size 900749 +oid sha256:ec6ee5e2d0ac273d7c4acacaccdd50658b7069cf28c99e70aca8205e98d37ec3 +size 795029 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index a6c46d93ee..dda891ddb7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0892085759218fa61dd9cc5478fd0f90b40792681a027673b5baaf236ee99181 -size 995873 +oid sha256:eb0f4f9705c58c5e2549686088ec278cc1fc43c0ff619cc81282e2f66bf7a254 +size 802929 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 81efabcb0f..4ea1a9bc1d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c352652e6a93084ab265d37c6477ddf594638e914719cd5f4ff3ea1349a6779 -size 878501 +oid sha256:a4dcf114c9ffec25e3d22a019072930a2fb7d591ed57615b64973091c8aa5e4e +size 752109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d4c11c0d68..19065d00e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3729fe6aa08a35db0925f8f807a51169c6960aafd95fb2c4bd5e20e0964b3afb -size 976907 +oid sha256:4c6b72c9b8b0cc8b7d5856a708281a69261c032e91e44112895e6f41cdc79afe +size 1024119 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7f97041072..3db351190a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79d3a6fd006038c0f7ad8a7138ce2ac12b93e6b117c352be75532f7d483de6f9 -size 916175 +oid sha256:58a2c86e7357263b53b674e69aabedd88bf03809a17480b1fbdc34b8fafb0256 +size 940645 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e3b26c768f..5d2482f5d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:569c995665f25dea61fbe3d812e8e46be804512638fb93f0ec1a194d15ac688f -size 970291 +oid sha256:8ccbc10a10967ac893fb55c426eddaab040489519ab5da280d3bd7f0a2e71dc1 +size 1018785 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f9f2f44ae3..e9b6c2f45d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d407a82b8f54e63eeb6754048e4c1ca7ad3ccc6629a1f3941026a3d854febeab -size 916613 +oid sha256:57099b503ab38581b60eb6c6932bb7f3d91dbf9a7acb4d628d22d21f3fd6f388 +size 967081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2962f46adf..c1edd46b9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92220adc6154e7c8c4356006f87c6ba9980499c5cb2558bc19b2c82910e36e6f -size 1092811 +oid sha256:a439a23cc2da04542dd839cec8cae90f2a7170460b2a1134152ef01b69cd524d +size 919897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 651cc209b7..0ec6b77f0f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bcf640970a284d11b792bb774ba3572846182519c41ce35515b92cafbd487efd -size 951167 +oid sha256:cd68e530d2ac17ce1d1801479d55f38bd87f9f3c2c9c5e6ef467a70b60a32f08 +size 816141 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5dbce7af48..6464851e9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ade87b0432b96213e96ef4410635bdb920abee26fcac15b331d48404a74e9e8a -size 1126663 +oid sha256:de6c64aeeff521e6216cc598c0cbd8440795272a20690055499740d9127336b2 +size 1009249 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 32fa89878c..e67708c139 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c27e998d7d510c684fe9eb33ad4d1320a57f33a09ade9447d1d77ea102e68a8 -size 873229 +oid sha256:9f0ce6f32f102c38514dcca1761cb50c0b1082961af0e71174e933fa65366ce4 +size 913535 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4ec15eaf51..5740bc43e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9bb817632ebdc48d8db227b4586803707853588f04d88edc7841bf0768161cd -size 1071605 +oid sha256:1a1cbae6cac64a2540e950a52200dd8a61a8169db89e27ffe0caff33fce5a444 +size 920941 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b5e5efb2d7..18752f499b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f09041fab501f45c38c69afa12fbcf56fde851e9bd27c3485ba6c89c676edab -size 818073 +oid sha256:573fc9fab35f40e7cd6e91c6dab11e0a6bcd9593cc7423b63359417a3a919d8a +size 842197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f3aa5a2946..aadf552371 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3a92878f5bd8cf2177506585449eded16f233487d1b3d05eedb6331270a3eed -size 949921 +oid sha256:1ee6d029cea4f82f076516d9234e235474dfb63982475d56fa2260bf29c62fba +size 976809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b3fbe6e3d9..770b35f096 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de40e102f49e7b17392fcad90e44066538dc4e030cb0de565e1015fd0afe5c50 -size 889191 +oid sha256:f28fa07403dea9ce7f0ac40d4a24a41250e4d887f3068ae1ca18ad2dc9ccf802 +size 897725 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7e98b002df..7a24ce4100 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a56a8bea89fa60e0a974f96692728943b8926ed6c5f297555e5d729eb144c79d -size 943305 +oid sha256:36e05a810ca654bcb26c92700ddf32b626ce9abdf1243d8d632c2f73f29ee2c9 +size 972263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 564f42466c..bf642fb30b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:774107e5d7c601497079d67cf4d1f1d89ec6b45ec36ee518def077db6beeaa92 -size 889629 +oid sha256:3511303d947d3dc39cadddf3232451165a46ea1ee6378285d9957685213c9883 +size 920511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0f68b6457e..ace480bc66 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a18932a8becf6534c6217ae0634680f78f881c795ac342c6366bdad4e11331a2 -size 1064147 +oid sha256:16dca955a71356ba391c84922889f8e4206cacc44756d99c57be031d28380139 +size 873819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index f71706f641..08025da50b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5486b0ccc299ff3bac297be623293a5bdde81aefd8ebf0da75f4de7bf793c24 -size 925759 +oid sha256:195bd01606dab1bcd576f7ac535d97688591c701e5e10aaa34173d85cb3f83d3 +size 772777 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0485bed94c..26f721a4a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d812c674ef3b916789c19c2fd04e267f0f62b0b58a5b133b35fcbd71beb7e77 -size 1098937 +oid sha256:6e3e8390c6f39ad075b3ec92567a6f3a8773e22ccb03b0bdb95b4375165eca18 +size 958387 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 897df6bdb2..d2b3f841b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba5664d49631085b2d8b8f6172ada20b275b1632bbc78b1eb7ecda8b2dd79a0d -size 847133 +oid sha256:0b45ef2cc6f09fcf0a4bc7db7e003b865ba42509988f0673e61325bed8f7122a +size 867457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3d3f1d34a6..924f11f904 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0b2fc7e4d23db1aa3dc649094c5214983e465890029057a6b81b46b2ef3f399 -size 1043139 +oid sha256:9e23564cf3b4cfcf7d5d0dae3b2d2b2ed57e34c1826138c570fa8826d788712f +size 874863 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f8c2ceb6e0..40e7bd4fc8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc8d36116836df3066c15ac43f4cf68ae3f7fa0ef65b11cec4db2c0d96aeaa8b -size 791087 +oid sha256:d726c49e5c795436401b887ce543de24579a8c7ea9625ce143a16b381001cf30 +size 798783 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 50f47e1795..557b3f6fd8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c7bc981c3847e9c1c902f5efd6b67911fb0e5bb0c7a9223cf4cdefc9a1e7061 -size 1005357 +oid sha256:159255b049d6057916bd8e2a05dc3901c0b4eb7244c7766a47815a6427064839 +size 1033477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 73777cbb21..9f06ff7a6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ca3b3f478b4f589b6ecf18da2cf91b48034bd096c5ef4463cc4d9526bc0e456 -size 943047 +oid sha256:1f2506e1d415d96c01c02ee895ec8bc71e5bba499e81b81e1cd2113dc99a51de +size 971709 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 53bc43bf78..b1d30ed8ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86e07474053d6d94782dd9bd496ba58d476b18cf61b71c73d0690f0a364d8218 -size 998741 +oid sha256:98a2012cbc9562de4ae05fceca7df88648f3155c40857540719b50431f6db4f5 +size 1028933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index be7001d8cb..fca9a50805 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd2a08c743c912638e459c0475bdfda0fd60fb1e77fb0922e09e4bb68bde3533 -size 944275 +oid sha256:5e1a2fb990ff628fea42d19181330c9d3c5fb37a6a237ea1fe19928e57fe306b +size 974171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8c1ac3a8ce..95ef90e112 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90bb477f524cb623bf3bdf714f7edb926b8ec411c778f6dcab08f6a9f82fe863 -size 1166253 +oid sha256:4add8397332c371d7649680dbd1bf3b7352eebc60284040eabe6294134bf496d +size 916675 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index d6352a7c35..8e5c0fba89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d4d1b235a61cf0e16826b02ad2849e6e4d0be2180b0650baf7da6e499b706d41 -size 1044293 +oid sha256:dd89312df6315c1deb7f65aab77ad1d419a108aef8dc7f1a1e9ac6746f78a35a +size 862795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index fe911f8339..8f4dba24f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33abb86c1e2547b5a3cff24dff6d6e0950883465b2b837c445800ef16b4113b7 -size 1151463 +oid sha256:7560fb4f321063b757f213e94f6ae5c2e0e5a6df2eb2c879315b4a3ee647727d +size 986047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 65644b81ff..cb0588a1e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40b7ef65636e0f63407a465f51030472664dd758c13b20c33bba241794e745dc -size 897831 +oid sha256:98fdd19c19ddda2da10d09d42cff64793268ef1c9f637c590390b882c83caf50 +size 871537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 36dea4dfd1..1095a5a9da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:097eeb1cf70b9ab89970e09a88b1e3bbf0761e2b3ad6b9c62ed518adaa31edab -size 1093839 +oid sha256:9d53dd91519c5c46137c16f7709b40c739b5f5bd73636fa135961780edb0f6d2 +size 921665 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 40a53bf561..663cdd50e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b790d6b8ce909fa4aa3c070dcef4ae73f08e5e9a3a93bc56406a1c3efbe177a -size 839517 +oid sha256:6c8d70439dc13eccad831a4db18b2b1e8cc0dde3aab97ad2800cee2170d7fb7b +size 815295 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2ab0be70ae..35fe5df9e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fbe294d0ab6aba0b560355544226df31781c2081100eb279aed67e0c77036ed9 -size 979161 +oid sha256:47820675d2a5fcbd08b1f8512224283e596848c905195e50f5d5ee17c070d508 +size 986955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 13e22d5576..d2833fc005 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e8cfee367d3acd076c382c818681c3d53696a64afa26f1ded6182154554a4c2 -size 916061 +oid sha256:0e56b3b23a53ea00776a6ea3e92412136408436894bd91757752e2ae3f08b7ef +size 924349 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 442835e566..d172ea6308 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a94df368cf71f73eb11c38089c974a3739ad04c2599a142f8e40584a193cbcf5 -size 971755 +oid sha256:3df16dea5acc4f415a5299645fe9052e15907a07cd60b56e8c38e17e0dbc8d9c +size 981621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 286bf0a659..9716fb8bac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c4db5784dec1a9805304afff28f4bb0d2acbf317d0e4a384de29a850929ef93 -size 918079 +oid sha256:ad67c34f39ed0e1e86936104220dd8d8cdecd7071fe502c0a5a2237956998b81 +size 926811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index a81e5b2997..7ce0706040 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6b5ac940e5199fcc9410f683804d38cf4222d6a760374fb57c8394590b14e5f -size 1138577 +oid sha256:58b68690f70d92048072ebcaf1dc216142a1762915233c41e101695e906e7981 +size 872225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ac6d389027..78926b0d99 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a1762967ed59d330897743fe1b4369357510ae0b92640ecee0082e1d8bf935c -size 1022093 +oid sha256:649290d9e0716303e5a8c2e13b5089f50a43c34659eb6d891c4dbbfb848cc174 +size 821799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e57c213943..3aaee76fdf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd726d1447e551464a6410a20320d6975d689d015f3359d8bf4272fd96147e46 -size 1122947 +oid sha256:292abc736db57bc3d2dfad64f9934d691a54279e59422b239727c5863a693f61 +size 940563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b64357d349..92e38687ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca7c81ca9f68e08dbb8da6f7b65d02d38f20454b1ba12d9a0fdec0ad95a8aaa8 -size 870847 +oid sha256:75a481a8e7c63c479068d9125d78f2df2e9299bd795f9a65ce364e31f0bf09d1 +size 828765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7057818f7a..4b337c9f87 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e8f816626f1120a801dea9abea476fdc12431f3cd4fb50675fb27b88a1220c5 -size 1065325 +oid sha256:065ca1b0bceca3b84a39694bf41c1945f678130760b056a8eb9df981e5029a13 +size 875391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f9bc1fe03e..3e07e9fc25 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:419d414d32a712e8dec0272aa979fabc2ca708fa2b2f95cdac08661df143bb7c -size 812483 +oid sha256:835cc34de0e45f9df2f5bf1b7c806884179abab78a8aae40b6eb60fd238fd19d +size 772473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index b92277822d..c75010d9d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:091816f24150799db7889a443b5bc026afb5d73c1da9d19af9cfad24304592fa -size 1240907 +oid sha256:6b4d17fd20ad34ad3ed509adb4722954d549e169c340fa886753dbcf38d2a1b8 +size 924533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 1c53da6307..be2a46c862 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:24437a61c2a32751465a7c590c9dcae327a1340afb160dad3a421e27a0ab034f -size 1127827 +oid sha256:f90630da5546d57387c3e364fcad03cca32715a69c192ce2a29fc09164ccdb77 +size 838487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index f8c535447d..39abaa8353 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:adabcd9a86156d9a985a5603d41619f404cc2976afc00a3de054599d14e9f0d4 -size 1236515 +oid sha256:29fbb70ba41e8563d5d72af08d907707e502ff7dd9394692f115cbfc140ea503 +size 917427 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index c92befb34f..bf1dac952f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8494c05276a9dd7bc5733013daa8841252a2bd08278c759beca4f99bc8dcf899 -size 1173803 +oid sha256:6c29b52c92b5980cbfbc0750f6380bb84b1ce2bdadf1898a0c43806d3255ff73 +size 811747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index d0bd931be6..c5e5a66989 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fadf88daae9e44027a27a9de34eab8b88b6429142f07214c58bbf77d8f9681f -size 1521941 +oid sha256:01d00e21e02203b56777a4082ff1441abfc5c4876ba7299eaa3d7b293e5c9b3b +size 1050759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index de13c5bd99..f5549fdbbe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56bf5215da975de7958a3fbe29c558251c8d936667297e80bbc5078a70ff60d7 -size 1408417 +oid sha256:f24fc1dbc17ca29b649dfcb61130627dc72d4f89a8edc5b623b79439a884a9bc +size 943599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 11f65cf54c..3d135da6d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e740d82f22755a7eb8bd5ce28abf26ccfaae461b1ba736cc53b9bada8250ff8e -size 1332027 +oid sha256:ce12e35e46a943a738b559cca8d010448596058aef35e63d94e7632d58ad8106 +size 931441 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 46973f7a43..d2b3291ac0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e387fa1571d9e5de2f34742262c4e88c3b935c587ac8b464659c5e3e954e39e5 -size 1194675 +oid sha256:7a15269fe482b2ed7d7ec751fa121e71be43797ae653ed623d3eda4ad28c0d3d +size 885159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8289554ea9..6cf104d02d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39f7fdd02de12132a144c236a90fddbca75b5c411615ce5203a5bcb8b0b518fb -size 1326403 +oid sha256:76378325966f3e418e40270c5529448a55052f660604d9e71ad8a41ada7cbc9d +size 921623 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 6ef498100c..6ecab7834c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bd833d033d2f82888dfe289de4ab73bd1481c14eee2133bdb6b07c1631e9d95 -size 1187523 +oid sha256:42dd431af60c3b6f300a179398cc69fa59b7445ebaa405aac176bf55895a0b06 +size 875095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 78f0e6c6d1..74e15aced0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ae818e7393afb1ad3c2076058374831785f1aa4732fe2e89b60e82d34e925cd -size 1979559 +oid sha256:a45702a2d3ebaf6abe63f5df16a8f5d1a128b9ff75723864b3a23a83b4399043 +size 1060035 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 1e9313a08b..e578465056 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d37e95821f1e1a2c74e1f2e04912e7f2e3beaeea590b2b870131852b998eddd -size 1974127 +oid sha256:5ae6f392450a4797847ac39b4f2e203cd1cbdb5400fc347546e4c926995e3ab5 +size 1051445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 7b38f32365..5907da26a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e89269fb916c605e837bc94756a3a7082fd28494aaca478d41d7631f5cad0dd4 -size 1257645 +oid sha256:958f8fcc6e70201094cd9149713e8741729d704a261892843371d42bf3ad8d3d +size 939939 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 33bb23c042..8fd181f0b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc3c9fd82d75d030231510ed8113f7437a094ee17ddaadf825e71f8cf76f0e21 -size 1138695 +oid sha256:effbdc2f990f80fe7e1a0250b9e5c72d4027bacbb7f00aa9389040ec53a78bb3 +size 828881 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 44e343fbdb..5da81ff2a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb101cc533725203e23a07a1faddd1d46e63a37245fe9df7b6cb15c0091c7c5d -size 1972407 +oid sha256:f5c56bb7bf5f1c5127d5462517584c17fa5e5794e8430c6b60cd85075f803543 +size 1049183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 3aea085e9c..90e628a7bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6556e51a730fd4151a5e363116b8839b278ad396b9d6eab7a5ea3e28ca75eb64 -size 1966233 +oid sha256:db40859f5dfae05de01019c08bc20a2d7adbce05bbc8f3fbf72ea525973b14be +size 1040543 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index ab5c99bbbb..0c066020f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77fcdc8b12f89989973bdaf304d7ddba2d6d2c61144e36764ec5e6ba0cc4be1f -size 1252071 +oid sha256:2df4ddb12708befc3e7891edf3e15903c63e23f0c1bb6a37a507aa8f5c239c22 +size 930219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a3aa0c7032..6c4d13c74c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37ea03bc14703feee005f3455cdf91fbf1a30cc143c24301e5a5801e166f580e -size 1131541 +oid sha256:a4398069e96676e19c464be8e107764d4dd94d0af2ca4d5cd0ff49b058030147 +size 817979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2e447d8aa0..030fbad6c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:380a380e46d2851f76844ab8bc408dfdddca3360644dfb0044f3b041fd87908d -size 753919 +oid sha256:4ccd80e3a91e787784b87734301b55bbeeee9cf79aca94a2f17c4ce37e72e7a4 +size 758803 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ab7dcb4526..770e7dcd53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e363a55992958eb56754c98e4bfb4cf4ad8b0b77dfaab992223f6faaad1afe3 -size 711145 +oid sha256:e24bceba3af78f518aa55b6eb9614c7cff92a4c73c86a6275da30b850ea8b77c +size 718791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7ab7c8b7c5..e536a09d79 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d97ce544502d5548a936bca22286b0bc5562cb2d98a2fc5b55ee433fa847c2ff -size 778037 +oid sha256:315318b84b438e6b465736a5543fd0a321d3be7e5811874ea6ed56d55d8443fa +size 777345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d5e2ee86df..34e3237b72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5f98b0edbf2df99d58f82c354c9e476eae98c1fdf9f2feefe44699b126827e6 -size 737039 +oid sha256:f713ae6cee5f70b8add1bcdc91fc1134f121d3d757a0f438fd8b1be4e2325ab7 +size 739011 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 29950dd064..1cf614f89f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60afe13561b2a155c25f133c9144b458d61113caff4332a188980ebf3480cf89 -size 1398329 +oid sha256:5edc0850a06931cf5b8702b754d4380df1afc3f9ae19090efdad4d900dffd52a +size 990837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1e6048c3dc..76448bca2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fcee54ad752c3448ca2587425c1772626981e566a6804db197a77df07837be58 -size 1249829 +oid sha256:e1719ef606970369e9e9d0e025f21001e4416d1188a237b54089a99c08fbc4ee +size 882789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index aa45677999..a2becac4a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbb1f3880f5bc3b7614dfa0ca46558aaa1da248d0f9ac88ad625c5dd16d75547 -size 865343 +oid sha256:558b5ce0c388a81623d9fb332c83f955c2cefd17758b6500ffb14b2b39f88731 +size 774619 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b87a44ff90..d9be7e5a3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cdb5a035c13e17c0533a0f6598d1120e8944beedaa9ebed3b8ef20b4795a5ee8 -size 667507 +oid sha256:314788aa4a855b5b5c9025cabb6577c144b2de6a0963a76ad0ea0d6d8cd0d996 +size 672787 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d23b5d254d..2b11dda440 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a50a334a381a6a48b98e2b2d140b709f489ebcf834a38a15f58ecfdc0de377cf -size 798839 +oid sha256:92b911d2b33185fd80010868bb5cedb8de562c54f4968cf0398ed61e807408e3 +size 729773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dfd6903b5e..44f19f1a6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f3a66918776eba95f84314fb64774e426d149544da16c1a8f9e4a0c91bf17cd -size 624043 +oid sha256:a2a302ab649af03bc31c49a3148098afc6118926ede426e8dcacb5795c59ea78 +size 630309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fe64e34d8b..f178b320ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a3708594806f79a3c3b3230070009d984a20c5b9f87766ffa84db13b736c945 -size 746765 +oid sha256:edb501dd8231a0976c7b1680cbb85f596f408c80c449247793a82d9054ff7bc3 +size 747949 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 22b7808c37..3a7e072fab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e42c7bc4c9f73df1a421339e8b78c6041f27d1b40122f63acb035f1679ee8558 -size 703991 +oid sha256:4e176b0f8a1ecfefffe90a9712e34649299adf12ecee872b08898d98ef96fae2 +size 707937 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 575eb4668d..c8f3129b0a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29f8b427a227598d512d67fe71a917078f6f6f44da56593d13164d3d0dddc5da -size 770883 +oid sha256:57a186c2bd6ca1bb8ca32ca6e9b2934d626bb71c5bbb306c1ac61b23431c0bec +size 767281 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index be3223199c..85e5084ffc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15bb426dc985c8f0a01a6621574c80eb28a54cafcdce71cf97fcc4611d7ee8a8 -size 729885 +oid sha256:9a9fddfd58b4137aa604b458eb69f4e6ad7ec115042f824f47866aebb93e1997 +size 728947 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 9d6071eaeb..33a30de5b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67603d69898bbe1261ccd01f12095af5bb77ef86cdce1a9d3d8764ec30400b80 -size 1392853 +oid sha256:9b1a3da787232d8271489f40e419d099a9787df954c56ea545f0321c60e99940 +size 981069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 521eaa1bce..667388b133 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0245609e9b6d6250226abe65b6930440b9116fe7c22cb8aae3348ef4b48b6ebc -size 1242675 +oid sha256:d7bfa15f846a30170656941bc423fe22546fb1efdded940879a6d6e8da4eb0a2 +size 872675 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b3a4f3669a..e3dd96bba8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:878fb89a9907155d7575d1d7d8dce697ef94a8f91bfcde05fcaaba8d1c59a112 -size 859719 +oid sha256:300135c2368c9bbedd65abcc32ecdc9bcdf479d8aab4b1f390845b320ea6bc72 +size 764801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 531798e50c..688f0cf3ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4b10abdaadc00ed727ba2c1d6694cacf18ab7d0cc6dc4306163c08bc411ddc5 -size 660355 +oid sha256:b16cde8d65d14da554e14cce237c6fd0e58ac6c6414707a8e8bd43220e1a3b96 +size 661933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 64b8f3b5ff..25521342bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ad53ba572df85402c293d3edfb27009f131aa9796e54fcc05038d6a7bc16f69 -size 792525 +oid sha256:02586f52a28ea84f642cd7710b2b81f87dcbe3c340e2f373e8b7110110f2c203 +size 720005 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d88dfa81c0..f287ee9182 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5908d6de1227e61f4cafe7d23728664ba62bf82eab32e50d184a368909b8656 -size 616888 +oid sha256:1a2be7db45df0c2861c012906a7c6cc5ab5721e23329a18a2e68258d585f291d +size 620245 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5112f15acc..9940999607 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51b03daa53af34a09a9d562073af93104c45b24af3f2d0cf47c4dfc592b0fd6c -size 782269 +oid sha256:ede864dafe8acb7212f0d80388503c55d93cda9fed02478e815bb0e2d8c3bbc8 +size 782517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index acadd49eb8..d9d73f862b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2acdd3b1220b857b932c05a38cccc17622e9a3b9b40669d76885d3a2849eae38 -size 738805 +oid sha256:050feec1c61b2e6a5f0026033d817585895efc3e6c0b124e5353650f44c87831 +size 739447 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8bf704ce74..acd7f55f28 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8cce60af525e418345e2c2d21241363641b847e79118c9ca690ade9cec61251e -size 809051 +oid sha256:08014f074f3845accfc78583d0ee7dbf41488f0ac38d35c0e90263c17648d3b3 +size 786703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 980888b316..bdc327f7f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3886eb0066775caefdd87ec8f192dd53f61e40b3230295ae55c999d1f4fe13ed -size 767363 +oid sha256:e996c2d5c2c4ac6e878759c063e99ee29ad63e68542bbd6d11be0149ea5e66fb +size 744521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5fd018696b..9aaebdc05d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88733743d3085dd7afc57fdd3a3261beee00041db49225e781ce0d1acd911d3a -size 1610349 +oid sha256:20c2979d25366b1142908514b0c73eabf0bbcd24ce9cd3c18c921315cc0eb848 +size 1056829 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index ca0bc19ae5..4a7737cc21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6239cec874d56176521c673f83feb4bee9192ea9a6dda27a96f2fc424e237fe0 -size 1472997 +oid sha256:62b6dbd61430bf797d0c9ea8a6f12f6154481b6f8fd8e0892148eb3b99505375 +size 1014641 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9331118a37..621b34809a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f3d2afbefdaf2f2efee94af3fe5ba387ae5cd0e32a0ad6d8243b6a1ddab361d -size 891621 +oid sha256:5120fb139e428f799d25583aa8f6671e9f645b6b0f29878f07c8117df744686e +size 773173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1b6d201ad5..3b9ff3f0cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c716a5f2279a7e05937558caaa63c50e45911a62c0f4c99d9526340917a45af1 -size 676717 +oid sha256:3e99d07f64e8bf026eb0bcff3948766faaf7d7d531b0ecca34d30a0dbd4c1587 +size 681849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f8a695133c..9b61e97ec9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2461990093f5630a483131c4f25e989438e6ea54cc077747c3dee1be1afbca45 -size 823589 +oid sha256:ced1f6da10062e83d0d684a2cb47b4d123b4bd672f0111f904f3a4422c41ae9b +size 722111 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5bd464ca3c..e007635201 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4fd6f8a03792ca1b726eeed943e07d1b33184cd4b5feecc8290c2e6f20ac364 -size 630935 +oid sha256:8f536faabb34763d8dc6093b2d038d6812574028ccc5c9563b28781666116666 +size 636065 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 62d05594dd..5e8c123e69 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5af488cfb0cc316cb82f65308e1c600e358e2c13893f077cc8b2e834f9073e26 -size 775117 +oid sha256:c232cdde366ed290aeb08f353a6ae5d45adfce8cef88f38a47bb8c48274c513f +size 772453 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 14872edd85..4051650409 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b947bc1bc3f0fd909b0548d035263772f9bee2d67ae78eb49e1e9d46c70342d1 -size 730863 +oid sha256:dbcc50dd75a5f1d2b6da62d3e82e101cdabc7e950840cefc25ab2f33dfc96f05 +size 729333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bbf120fba1..9701b8434e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37ce454f3b6ed2749de26e9de96934a23152b50f98305841f5fa078589b21fb7 -size 801899 +oid sha256:ba7770e629eff581c3d3e4af664383eb7359f8262aafbb04509c5c7604390b74 +size 776639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0b5318e837..78e7b9c077 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b6b54cba792da9c196f5a46b0ee291deba41bb3a863e566ac5836979f1a168e5 -size 760209 +oid sha256:996c497d3cb0cb871023cc92cf6edc23fe1cf47b0dbfb19e6ea913df13a65851 +size 734409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index c828f3cb04..7ccfc73bdb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99aaa581085212b8376e8a51d17201886a146f29b61515caff10459d04ae5743 -size 1604775 +oid sha256:6f9b092773c9e4c02fd5e685c47c55d425e388fac656894860c07d9b4e6250b6 +size 1047111 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index f940c5373f..412a44648a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:482a2ffc24b050cfb91500f654d48f353e8bd06fbeb57b698d6092e6f22088ae -size 1465843 +oid sha256:ed521adf2ae7a9406a1671690fce301f0ee9399f74b10454e0de4bb32e74c04e +size 1004627 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4ee47071e2..8bd7e99b72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05937a146ef92a4e5facfdfecdefd3397e6b2e51f31f627e44b14d671aa2276a -size 885997 +oid sha256:f90e30ff7d587f7afa268df7fdf741a3a07522630be7ba5985dd061638a5f67e +size 763453 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e148bdeab8..01094e198c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a14703879692e49a11c782ff197f35758a3fb378885301c7a7573f13f1851ce9 -size 669565 +oid sha256:1999e5fbc689343a6633b29213db98316a0af974b421776604500f0db5f0cbc9 +size 671785 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b10649d259..e771b7bf37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0d703b03c42d0acb85cf4264f39f5b8a330d7547dd3ebef913b3a5c50d0bb65 -size 817225 +oid sha256:bfadfcb3c14753cf83d9a42f309777e20a624e3ceeae40c4dcea3a8958d870d0 +size 712391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d00500b278..0cf9ccfed6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4cd77420459eb77b645624dd23bc465788d3719dfcc8322bc62e09baa427401c -size 623831 +oid sha256:20932f83f473bdb08a844159ed74d91050e289be57bf6e7078a9d16712c541b7 +size 626889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index a491c5132d..7393b0ec62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59505325025ea0a25bbee53964f28ee3dda87985587343f18f68f79ccd6a7196 -size 1340971 +oid sha256:e46d1452683510bad92ac66ef8cd2bbf2fc0b07aac36c3a2871e45b49adc795f +size 1005357 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index a4071d652a..d8e675702b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10e0ff5b1a17d707f094a90e267c20b0112c548d344b34dac1c1159251dd5345 -size 1220837 +oid sha256:05107d8f25de226b20250139e3df0c6b2d38694771b76878f40a244f1cdd63f8 +size 904117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 9a1ef43afc..7fef14c0ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37777a718bbf85a8858334b7b630f0f3d7afbb7488ddf2a71cad7876f1b838d0 -size 1335887 +oid sha256:8b605033618f423dbb00f34dd6aa510b5b043d6cb5fd82704bb53f147ef3ace7 +size 1004023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 63aab05f4b..a1368cb463 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03cd87d98c869f0248d4178e2d0525a9fd70559869111be64027ab3e8817757f -size 1289063 +oid sha256:c8fa3cada56a038f1c48f216c14097f38e2a82dd1c6a6f11ff081bb3dd1de335 +size 926363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index eb005e27c2..9d9c771771 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:773c08c580d6f3887d43a7289cc3bc1d63ccf5fa6f0a8c4898ab0578b50f6b2f -size 1557201 +oid sha256:93db32949f94b72902c8c167618c860d4119c68b5fc1c90bb0cb3de986502694 +size 1062783 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 79e358a4a4..f7f8e882c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:afcc2c4df2da2813c780feba327e002c2c3ad50b6726b56fe73e7a2b12136c04 -size 1317433 +oid sha256:e1b9fcfff5b3b4843cc8638a21805db11c3f3ec1162a2b757c430339bf590911 +size 1017979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d0ffacb07b..dc43672a80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a299bd224b4504feb3986aaeea95efaa6aeaf8f881ac4100cd71867a529b162 -size 1536431 +oid sha256:5a40e4e20e4e06508c0af822010670d3020628f5166fb3cbd8439094a6c6f7b9 +size 1033775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 1475957a0a..8714d3028d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a20cfccb37ed5a7cde17ee06fc462b382461f68b50cd23c2d930b674d73204de -size 1295973 +oid sha256:d34134f61d0fc4db576376a95c57eabdc22d1093e441992f5dc1b131702b77e5 +size 989515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f08474751b..0bab1035aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b923b8d2de74eabeb5809114f424b74de295918fe814a43e767298568b23ca2a -size 1622861 +oid sha256:82d7d237e4fb343ffd0e9c2c420e389ba65f18f40022fdcdfd26d41706f87256 +size 1137373 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7bd4527f40..37a4b51da1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20576792ea1c663cddf6d951c627038fde3792469ed0ef0b169bd6267537fa62 -size 1382501 +oid sha256:6536e56f2bc3d8d460cb94155b301e2f2ae2756a61e3ad118bb079ecdb4059a9 +size 1033813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index fa173081e7..7d40d5b59d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cd350d3f43b67b99fe4335a96fc0a5305e5bb1a63c46b56a49db21bc1490e15 -size 1602141 +oid sha256:2232f3129971828bb57bba89c5fad617907350c9aa9670241270597fe83652d7 +size 1102345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 6d44351712..4ec68cd001 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c173872685e962295243935a17288f63876de2c013aaf4768434138f1091bea -size 1361929 +oid sha256:f941437900bd8859fcd723d7bd4c544192e767624471455df2842bb41b2a2f7b +size 1000957 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index fcaa820b3b..461e70549a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5909e630db226e1c5e33e2ee731ec2beaa1617128138301d5c57877f28a62284 -size 1341765 +oid sha256:381b3b4e5e3d0d75015c57ee95c14e111c14cb6bbe8165c9576c2f7e0c560bcf +size 1005361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 9fde32bf13..aac1889759 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5a3ba309ce4c89ca2b9b18b9718f0ee2cf1505cade8f9e76b42c6507f66ebce -size 1221629 +oid sha256:41808f664f5d6e43fd64b4bbc50a7742e5064b183cc88533452adf39c0d47c8d +size 904121 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 0eb0c1a2ee..a0b73d2466 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60a4d90c2a919ed982c92da6c2250ec3a6688c2b2d1556cb7300c0e3e7e15b03 -size 1336681 +oid sha256:c278223df6894cf4c0604703a57729a55c4fcf0b3b00f19285da2f8d208d1676 +size 1004815 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 2a62a1637b..0c71f0f509 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01e902a7b25c513e1ec825f4968b1c1c63a2889662a0ee799f7e10032c73055a -size 1289855 +oid sha256:47fb37ff46180b09c688b70a772034360ca39cb78cc2ead0cee5a51fa751816d +size 926367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7ca92f00ea..280243ea94 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73002acc1f8fd70399e8c6ebbfd7f7f258e6df0ebe331fbfdee7f3cc141804b2 -size 1134267 +oid sha256:4a9ac023875059509c272d2a9cf28c5dfcbb4699446ddb8738a8e997568ba75b +size 1181725 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 985fb3ee05..dbfc8065c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65e1874983c93435329e8927425e4b7cbef9d40aedc5fdfc8f202682d57ed9cc -size 1041321 +oid sha256:5542d490cc21174975e1a3e65c716e89a5637e2aeaf80411d1ff9bc9640a1b13 +size 1066827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index dfeb2c8934..16a8821738 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d151ddb1c42ba959ab2e73eb79238c44689ea035698ef6b4250dbcc8280e3ba -size 955533 +oid sha256:f0ea9754ba156dac742d3d603453ae845363ab3ccab3973f78ac5ce8b50e489e +size 978127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e1da7a2abd..c62ce3a4b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36a0e30a79862d1d58a69940c236a3136412c0445d8c209487e3037ff12e56c5 -size 1069343 +oid sha256:04dfb2e37602e2c4f54372fd47bc3d9d0137bcacb7bdf355963681f23ef8e0dd +size 1116259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 90f4d76f1e..e447b887cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:296eb314ed0e849bcbfc6aa96c74f4645d6e6f6f5982693a8438e90aece3534a -size 983747 +oid sha256:49b7a117b3c17f7d7e5015cfb458585e0811830263ef4c723f03e6dbe564ce7f +size 1009597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9ad3cacd6e..82163ce4a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e931825dc18cafc8ad10fa51a282440d1b882eeb0ed964d71d633c3ddd30390 -size 1151133 +oid sha256:0a5c05dd412977da86fcc6af9487fffda2785cb6d3ad96f3b042c48cd053f44b +size 1189219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index a1619a4f3e..4f06079f2c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90c490a4f710ad9e9516defda4620abda4e15c9cb678d5672f8c6b007e126394 -size 1059765 +oid sha256:051bb5aefa7f73547bbc9491c0ee593627124b7bd99125453b9b1f207c7f0093 +size 1075897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index e500228a61..c390678c4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d7cd9e364d1d1e359e1f12c972f7b777180f83f2726dd68694c8a9c29638f31 -size 915715 +oid sha256:8141c0fb18ffe1765828e19a1e9025839de362c477b6969d8cd8427479b83005 +size 937963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 808808520e..7a5a792307 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88d7c150998dd6a88494b99093b421ab6b5faba94ac97875191188f5cbdaec02 -size 1089859 +oid sha256:72f6c115e64ba97e4a3f94b2466d4739c814cf1f2e30c8b928cd9ee709ea55b6 +size 1129819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index ec447d3a61..85564d526d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f91f47e1c1177d1cf377a4114365a5bf515b1e8b488f7a5e7cef1c1e2a7084d -size 1005201 +oid sha256:298184cebe5038c9236314e83f23ed10d0af1c6695c8c875c5815d4c9ac2132d +size 1022419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8aff643a1d..7b4084ab1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af817843f8f41e21b5e5e0c4eb520f524668e18ee4049fda6446f71d21fb52e0 -size 1240165 +oid sha256:e25a93ae066a6d4dd2c5633ea8149b743eca911c21f9baf163c601f3c178046c +size 1170951 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6c598f9b32..c3beb4b748 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06723cc445181825f921b043b6fdd6cc9e29b6c0fd5dbd837bf249c0573c396f -size 1032661 +oid sha256:48d2362eb6b7b3aea9b3e2af7c8a6ad468593134bf93c1fae1f19c06bf269346 +size 1077013 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 80d12fc279..7929c7b67e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7129ce8e615a5df2ffbf1e60fad641edb7274f2290c9eb26b7bc7300baecefb3 -size 1155261 +oid sha256:cde53c0aae30ab8b2523afe4a76fb8fd0112d85531d3f628cd9dd0c228f867fe +size 1066165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 7b385fde19..fa6dc17cc2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95442d5976aa2dec5c6423387576aa2c1ea577417c609861c5c4472047222bfa -size 939765 +oid sha256:a60524a04604d7520c38f63b608fd74608aa123c3caaeb3a03f0f3049fd29c3b +size 963889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 87ecf9aa28..e38928e525 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73775ee0ad6340c5e38611d97485336cb6b2ce05996845f52c01c9d19b890704 -size 958473 +oid sha256:0f06c42e692a2fb80f67068181e4581b13785283e75376f8f71f3ab66b5a9815 +size 882795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index eb07756f23..be3701be99 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:360d5b7832b3b3c5fa679a0046c1ee99d5cddb7a51f0d8bbe6824abe59f0c502 -size 811105 +oid sha256:6b8ebc8c043c1f36cc89943f777e4812cb8204c1f8355f6471bd33c2ca43bcd3 +size 828521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c4d8fb5bc1..d330241942 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebd650b37ea1b224fc19fb4ccf8b80dc9c969673716084d006276fcf1fa2afcc -size 1177017 +oid sha256:43f2fde32c61c8363a86f89d70327cb3b9813aa9652dccc823c53f6ba3544c4d +size 1114167 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7e647a2084..f24884724a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a73e60b903e365aa3fa98f4f9acda195849dd4ade9be85068ba242c46b79bda9 -size 981599 +oid sha256:45cc9833669122ffc2cc82909cee93093f30b11d6b8a96880dc87b9edb4cfa6b +size 1054859 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 5754bed845..27bd0277a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4588fda50a51106eb31d0ae1ada3c7c851b313acc61ba282fbec09d1f79cd37b -size 1103509 +oid sha256:84d5b0e5e5ff6e64d718acd8bd7ff56e7388015370090b48d371c7ccff503435 +size 1016089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index ed260a7205..7691842e70 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:654563c0e19fd162b44e7f7d7ba4a45af797075a6863d39185b6e1cdd9a21361 -size 896103 +oid sha256:f24bc3eab3146ea64b7427b80dc3c6925386cf710803cb8b25b0504618154d67 +size 916823 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bc43796ee3..98d7b15d45 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55875686700d4fa56ffd7a6a15ef2cf405514391d8ef8b6b3e543df53478b926 -size 1087301 +oid sha256:048e9fe7bc2192307545d3e0e061ce58ecc4220d0ff27721a98c5ed0795b3399 +size 1108269 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 79a2a49251..be05922642 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3576907344f35496840d75959a75d6819be0a23ae5f558472f8bf1736b8b0543 -size 1019023 +oid sha256:d42b690baac871b720ccdf18c7d56c566331507a683d526d1fb51affd1387e5d +size 1032639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index a16f000d21..48cba27813 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b03ad120ba00aa0c2bc3fc7a91bdef40342b92e07f03df6e18459be5eca168b -size 909405 +oid sha256:cb557cae9818d7cb7495f5546d4104635bfb84307aff36171c462cf0268e233b +size 886169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c668267542..48b2188f7f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aac1f910ca251880b164d7d5eef927789aa60345b4a4fcb9ac4167df0d26dfe0 -size 1022229 +oid sha256:444858a7864d9b7f8108cfb137db0f0b5392d8eb9ab983e57c1690cfb6c3d632 +size 1042801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 4ec5dcb4d1..e4da174ce3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68055635ff547e0ca032143a2fff447a1fe02c49444ec5f5732422d86798343e -size 961547 +oid sha256:f945aab2e9bf220c9cd922436bbf1b42cb06777c1401eca37bbd41fcaa54aa04 +size 975459 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 332bc78036..b0a18d46ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bf2d448c6ab56b9851aee30e07679070f857e607e1c3eac01a1544beed6568c -size 1104957 +oid sha256:772ec7f0ffc98b0d10b278006cef9b1d0b68d211bf9538c3bf152b9db893adee +size 1117339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 3bdaabcf06..54398c6cd8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3505355e75ada1e28c6a4fb8dd62f8097816f4a7062ffd2f7cb8fe173d60e9d9 -size 1037467 +oid sha256:d21b74cea5195a216e9ec0b7d1606958cea5ca461a86e18df33b4e0748f929fb +size 1040921 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index f8c444cd5c..3623ad1d80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df48641894cbaa01124a07379ff3ffe8e2b70b161571a0f56ab8ce3f48541bb1 -size 868749 +oid sha256:9b6ea76f6d90a457def4005270f065116bd36b1f9fdc110c11449fd65c75e330 +size 846795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dbf64af635..767a975e4b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e95e1d35a14d10be5d017efdf2ad43e95b48256156ce7c26e861306e6ee9805 -size 1042893 +oid sha256:e0278efee866a850f8b602b82ec1a204e1cb59804825ddeeb43b6513e12cce4d +size 1056361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 3b41d598cf..6257317a83 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c2caae417dafa7517e462601d743e23a49aa0c51d1a3fdee599485f564e669d -size 982113 +oid sha256:476a93ceea5d8b7af91038a88578d1e886955355c0bea9e70236355a60992950 +size 988231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 48ec4c5a25..fe1623fa24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b566a6e6de476024cf61ed06c0a2988291f9c85cf05adbe1fac2b7c73f14a710 -size 1198873 +oid sha256:6294a3a6762d9fcc27acdaf16da97141ff517c7b8a6fd0a45d93818464f47df6 +size 1088515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6d7d2a63a3..7bdb75a5e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d5653caf505f3e49a72f4b6c855a2dd99e9d006686daaa4d2927c35c078b56f -size 985597 +oid sha256:6ee630847ce05154a377db4cb389182466a2ba932b6c79edd623c79940d7b32f +size 1005429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 2d54c92d5a..489a9170c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac2b27442376ed2093b2462cd5b79b28ef2ff44f39a528e705f0be30bce06aad -size 1132173 +oid sha256:03ba4b7508e97769cffc1cef815abe71162da22aca9b85fab4f68cabddbfdc85 +size 1035973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 4caefbf8d1..643cb8d022 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7e6d355e495210fedf39023707d981ea18eeacef3defafb23276a9ea3bacfc8 -size 918305 +oid sha256:42cf94e1b659d688ed6ab3026ceae70a0846d32cf07d2c8d6c87f3b09fa55478 +size 929651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 9647da712c..634cf833f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc0c022dad7d164fcdcdc5071c2ba70851973a8ce915ee427fc840c8b31cf673 -size 917181 +oid sha256:ed5fcfd2bb0b28028a8ed7c88070075c14084a063abce83794d3e83469fec080 +size 799421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index cb76939020..d3ad0e1a8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6c26c5c7b6349dbf6cd6634019c49206f58e8f5820d695961d0fc8a49f0495d -size 764239 +oid sha256:5ae4b381ca2e749b7f2c74de4c58af4de5c967064b7121e14e046b08c23051c5 +size 738289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 12d1c01be5..17a4d2b140 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdf20117c4e403d415c3a4fd9abe3206efa404cdfa498c8e853a630fd879c24e -size 1135725 +oid sha256:f6aabe837922fb91af6282441f155734312516cc3ab2fefea80dca966a51da17 +size 1030891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0747d23319..ac554d797d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d80c5b17360f103a47ef9fc94b2ed3b987447429effc891cfaddbaa2fdcad9e -size 934535 +oid sha256:54f1d9d79fbf2754b7b1ac1233ee1a55ab7e1cece8f2fd11876ed722dd3dca29 +size 967589 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 53e9c72452..0516f89d3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b3c10f4263b99053187efe5fcea4cd83f11479b27d0260cb7cb45b996f0f1c5 -size 1080617 +oid sha256:29b0ad01235646df4b86f93d790124e92b48e06be80a03e5cd9a68174ee441b0 +size 985897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 6d44b0e4b4..93165fb3ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff62512879c52ba67b05a310138e439fcd2d12b988327514d182ad6bb2dcb0c3 -size 874643 +oid sha256:8984fdf05b19cbb4662a72cc02ce23d9e6651fed544f754cbd77e13dd9e31e81 +size 882585 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a7ea15e3e9..f34a9e465f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76597ecc7f8ceb38552be35fe14f2a0616b4c87743ed975f5ed5a01b9f05f1c0 -size 1225583 +oid sha256:fc7f51f067a92a5648850cc91f5377fb1b5baabb300e7269e0ef3857da346075 +size 1293269 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 8c2b2cfefe..af334be7f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1dfa25b9c9d5324e0992d7098da5e9cd264229478944b89a1855d76bb9c522c3 -size 1120895 +oid sha256:5ff65119deea3d1cb6daea007a86a7037b0e729bc8096cad9039d4449fd9e808 +size 1137323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index b498a4be14..5f23baf43f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0864eb6ac9211e1c5a6d846940ea2e317e69721ab356292131329b6d4e4ad05 -size 1148335 +oid sha256:40462de2c3f74914d1370ab822efce650b07730314dbd7f6b3ddfe994d9eca2c +size 1161409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 690f3e6f68..4839ee5dcf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbb463ad27b15511e0d5d1daaf953ee4b8920091f2424c4e6e1659222430a6fc -size 1049265 +oid sha256:f974ed454ba11699212ebf5a49f199169d1dff21e0fb4673f1d551822fa55da5 +size 1088979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2c0eaa31c3..5766ceb805 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25717d651b8fa6664363e6cafb59a8e0a153384f8a4e81d4b9b4663cb501f6d3 -size 1158291 +oid sha256:3a6b7cfdbdf689f0fd116aa67c395e6a905fe5cb853a0669d13d6fb85fb5f5ae +size 1232981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 9435324a67..7c63e92fa2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:efc487d0a602e7c24dc92454e1116c5b2a228f6dc7654e240b8a7a86f0748155 -size 1060313 +oid sha256:f0ec0429a75324d70972d99dc07ac5710168fa1947c84f55e34441a703202454 +size 1083745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d17e25abdb..4e399e4278 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8253c84b90d643e8b95ed4b402632c620ef4a2d15b760ec9bb8b57e720890a45 -size 1232879 +oid sha256:abbb156b15413b7baba15525437101acb7ffb295839ffb1ef7c21dfb07966513 +size 1296667 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 640f8e7586..4aa59e2d0f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6e35bdbbfb9b68bc8e49217e8559b20b3f014fa6d9202afb839fac3f6b1d9d5 -size 1128241 +oid sha256:5cd2b2707d100b77cec52d15085e5c6c8db87f587ba1381640a4a56aa0811336 +size 1139981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 53812f7249..189b961ad3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3dbb86513c97a4bcf15cf35b4cbcf78454bbb4a2bd110c6fc4a209f1e5196275 -size 1057901 +oid sha256:25999e5bc4eb29799b686d904a4029a751745ab2f4fcbc7a9c18873b3922ad27 +size 1069099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 8163aecbe0..48fbe3e5f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6080b22cf9378131a863d3611cffa9107d5797ac17bf4ba1185ef3e3737c7162 -size 989517 +oid sha256:86db06814cf92ed089523240dfb048550c0a27583c3d043b08cc66e2259d4d8d +size 1028047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1a7c56a6b8..75f42f63b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26af9b73c6ebf6ddefda6c0af2f02cf102ffe88db4231e025dc180278eeef0c7 -size 1178017 +oid sha256:0f19d5036fccdfd94637b75d905d1368f75b89a5a91002b85a90a494f1d658d3 +size 1242743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 5390c03ac5..643a67d2f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0ed8c789a09a9f3c1f4b0a95fda3561e6d87731ca9e7959bbf5a1e7f207fbaa -size 1079299 +oid sha256:98da6a8abb76425b72d9c3551a5c824873462a9bacb60f52d077ba18497d71e5 +size 1093507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8427de3b93..71be773042 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8cadee35a2be9493209c8715c1a38bca856c71bfb04fdd547b4d1cd6195502d8 -size 1399167 +oid sha256:680694fd9d80d7c18dcd777095a62a8854ce9ccf336448ffc936450323bb4c74 +size 1274107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b8a953e32a..fa22af7502 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5bd3fff8eaffc2a2f727a92a43e3d8c992875cfd54148960b8e386a98c2fdf57 -size 1114653 +oid sha256:8bb4e68a1176d172c6163344ef31aa4c252502e360ee4eaa9ab55e97e9205ed0 +size 1183325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 31ad84b012..ceed22dabc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e3bb04cd9c37eabc0da5f7dc5ba0f96152db087a4726059584d4d04d7a8345e -size 1304987 +oid sha256:9134f59eebb7c226614a1bde47ed0826efdc85ff5baf504a16ee35e9e3633631 +size 1134047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 786863072f..8a867114c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:785aad8db11e448ac4bfa01281c6635060ba9d2b2e4a9f7450a259188c4b78d0 -size 1009769 +oid sha256:8a60aef1370bd9a7eb029ce79c18de103f30dc66ef28efb275b20789f4e33d54 +size 1029551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 498a8afa3f..aa4a6a5d15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79ff883fc29cae40361319dc561d7170eead28d664f76bdccc86d53ae9feddcb -size 1050685 +oid sha256:7a9a01b36045d6150d41332bbf356e83a31076aa4180ff2505aeecf94ea35195 +size 1074413 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 267106a74a..d6707c8404 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52987f260d9327b669d2a08774479c876355a2eb2a32c4f688e394527996ba3f -size 1041945 +oid sha256:73ef3d00af84dc282a49d29ff78d62c448f21dedaae172e10579d1c4638dac5b +size 974605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index c4b33c5c98..d332888d24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39ca16c0348a8ab81c9f9cefbb2ab3e3190561859c989b2d9f6e3ffdfb4e6bc6 -size 952453 +oid sha256:49ab0dd6455466c3e3c6764715db41ae7a2af1e465ee5431c8daf669eefe75c8 +size 960397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 8051e5bfd5..f3b4a92583 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:478c6efda500f2fef3f28ffc5b553492c5e37bf614e413efb1fc1e4c8d2e1063 -size 884661 +oid sha256:5f9cec23045af63e05c3c61eb62df8447646db8516dcb96ffeb19500dd9f2592 +size 919491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1fb42ea8b9..0393da7ccd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8555120d33641d82f024e0afaa2e0f23274de5c9852b135b3f7f8268cc1f12eb -size 1315939 +oid sha256:cd121aae5c0aeb3ad45d4960f6d63d1f6b1a3b2e2cf0cbc3c9510b7a25fb960b +size 1212735 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d86f5e7725..7667190483 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4bb41f086deaf8858f94e24df4501b70b299f7af7f5801454a4a6fb6b1794d0a -size 1058905 +oid sha256:1f6765f1afcdc1e2f749c8b6a229b92983d69382dc12c855beafb7c201b3c64d +size 1158263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 04b07eda3f..69f35d5c1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a1dc4f27a8ecd84cbb12f7bfa216640c65014f11d5b9e492add658a21fea947 -size 1242481 +oid sha256:0e5fc8523c12eb27401c683aa1f69e1e36012a49f6f833a9cf39e03cd385c45a +size 1079335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index ca506c7ce8..61023d7e2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a8bfe684d8eeebea940fadc84a059e9a6eb3b195dfc18f510735f55d7a8deb5 -size 962111 +oid sha256:e0fdd00a14fe8a1a1f0f522bb8206397650cddfe529a97152fefe79eb35bf2a5 +size 978489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 774f6640b6..3f8cf49668 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0574523c2f23500b8070867c67a8a1a1234203ebc516dba1dc13040c8f7ae6f9 -size 1169985 +oid sha256:611a09931e3ab06d4a328681b963880faa0d8571ade0f0b279665ae445e8beb2 +size 1198401 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index d69701b6e6..ebb8d256ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:188ac0f29bb1e4c879d814764f4580c382fb6f5bec72170a82fcb691c6a0eb01 -size 1094699 +oid sha256:6990f6a7cafa72e1860a0a4a248296fb041ee95811e41fc4ffa736642b0fb83f +size 1098103 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 750e218ec3..573d3c4225 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:832d730ec3823153cf4ececae54d42a113a1925621cf047e50f2ba0dde484db7 -size 1110595 +oid sha256:2ab41be1167ac249f96b11c4f00ec6cbf37367cf00eecc4a4ed63fb20a6a7514 +size 1102653 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index dda771a2b9..1b653f3bcf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d51a1b13371da656053a63faa2f860a4f507f36246d4962981a8538c20a45eb0 -size 993173 +oid sha256:357ea4115f901e49331638637b935014caeda74b40fa1c67b2989c3fe4cbc692 +size 978325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bfd9556f00..f6d6247d67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be9eef4dd94c6411a5574879df4a8c1cde99968a89ec0930d53c11f2cba52fa8 -size 1102691 +oid sha256:4002d6c493bc94e4027eb88d8e7e666fb7a7120defc65731276fe632e186d2ae +size 1137275 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 0a0527e5e7..eaa36cf76c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3dd4e9593aefd920a7c9fd5ee13c9c1195260dba447f9924729b386130e42b21 -size 1034117 +oid sha256:850bad1921d99d0bf561427f43fa78cc636faab0392b334857f5b8867fe2c017 +size 1043737 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a3bda14974..8e0f1827ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f98646ab7cac351ec7e9ae0df0497cd162ec3b6ccef21f82292a7fa078eab95 -size 1177329 +oid sha256:79f7cf50e952ba9fe0ab2b9f3de5ce273b7fa3caf93875b3c9893ce47a1fb91c +size 1201009 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 8368823558..ad1698a542 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cfb5971a8a6f59e6823848c9a51a16a77ea85a3584bf6215bed0f7d21d044a43 -size 1102833 +oid sha256:9424c878c89f8fb42b2e6a4302f77039fce2b645f97905415ad95ebd8399ce82 +size 1099973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index ea6b02b131..43fd6022c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed3d46793f06e7b4b45944c43c780c6c8c00f7f3723ce4b364b460ff096cad27 -size 1019371 +oid sha256:86c6cd0c88863f4a5730c5448631a3b17e8a8d7d74de7632954de4ac408d2ee5 +size 1010343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index a5612a2ab8..d1e880bd83 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c784cc953aae09e0e0fe40b8918c8b0b5e44d8ba32453c87bcff35d612114362 -size 933425 +oid sha256:c647cd4e3a1c7de151ceb085e05395187c6bd78cac35c95c79d426418e01ee64 +size 917391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 202d8ae3b7..5a7ffa40ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae37fd0e2d7e82f12e5bb6e0e0493b58909f8d298c00471339108fe00729888f -size 1122419 +oid sha256:139a280e304cec724dc8ad4816b752598424581791d90b028504c775e633ce33 +size 1147037 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index c92f39d574..f13382578b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68b798243a4672eccfad48b5a668d55da1e9f42f112c071c6b430fa17492df7d -size 1053103 +oid sha256:9b7e37e791bda42f7ec0cab4f45ec1dea428432a4d2a50b61affa38632bd2ee7 +size 1053499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index cadae1b753..029703917b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eabf1d8d1aae3b874ade89f20a6cfbcb8020bb4709a4c9b25cbbe3c43094d93a -size 1352349 +oid sha256:e0e012ac2628aa93cdd16f2154b2080126a7b26a64c752a08439d1c8add15bb9 +size 1173911 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index de894f4fde..9ba3bf7fd0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfe9236847f1017e83048855e268d5faefe5635acf2eeba3922610dc90405794 -size 1058365 +oid sha256:710428ae4a24bac1001ed60d316545aa5da18f7b6ed464b8a10cbc39d90b8cc4 +size 1091417 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index f38ee54178..3d9b060651 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f61ad3a0119eb0a39dc70e1439b65a1863cdf3fe409104884e89814e4567a5ba -size 1277163 +oid sha256:12dea5adba53ab7b4388d75f2e343f8a6571bff9b22a7b1ef5b58504c9331bd9 +size 1095963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index a580a1c4ae..1b974a2024 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce14014805696c261dc1f2277dd30547b754aa45d86403d4b3a0e33ef4cb4c96 -size 984411 +oid sha256:20570989b5ee6f52464497d7343d2ca0920215abd9991eb419d2f94a1337b086 +size 989443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index fa14367a03..10e53f2ff7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6fa1505015cb29b8f687f686d1ceb80c802c1265e8945354bc13cd565f09273 -size 1012155 +oid sha256:a9842dfc87db8289ce76dc45389c04450ea0eb6d8291c03b2cd793f2f9e59afd +size 1014671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 42fcd6ca34..cd863736c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6216e316773ef88820010c0e637c17ab3fa7132fc6f022563a76a7a180d26a1a -size 994979 +oid sha256:de4fcc9b6155ad1dc62080444a350f65e05e22c9e0e8113318243bf184bbf3a8 +size 876333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 531c8e00b6..05dd03b2bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac4a84f47c79b53593fa937c20a9373a6c68bcb736d06e1fa0434fc8aee30553 -size 914713 +oid sha256:bd8d38860ccd9df97cf61aff73d3b44724d77b6f5d5eb3c41295aa7369ce5c24 +size 902479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index f87e3b3aad..c21093a267 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bcdb49dfcea364063ddfc39debeb2eca271168f0be805aa14b154780ec15e690 -size 828373 +oid sha256:007319fccce7692a4adc9164e0d626d945af97e5a98a2ebbf14edc2e5f78ec58 +size 809675 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a4f0cdab49..b6137174f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad081b0756c1dbb3bf70d88a027ea71c19c0bcd4d94d537d4c123b96d277b9cc -size 1269073 +oid sha256:5d17fbc979a8243895056688ae72c68c382fe84dd1fff18322f190007373521a +size 1112539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 296de4979b..854a936859 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f982391e57466ce94ba5dc0e5d8a655f8378d0b1ebf3578d714bf72119f0d95 -size 1002615 +oid sha256:3c673e264252c093579c947f9476141653b9cdf9906f6a16b345fb7fc551fea5 +size 1051209 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index f137e87e61..02c6e98b3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e953fd5a2bc6197a5ab4995700d0faaafd268b9164cdd8361887827181ded222 -size 1215593 +oid sha256:5889b00240bebeec3e85d3ce74e5a159665bfbd9b25fda35d32ffb0bec20a959 +size 1041299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index c37bfc073c..2d37f1ac89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:597aecfb5f0b6fdd10d33f190492df663529d87303f1713a80d14b76ac26d78e -size 936753 +oid sha256:24fa0329cb0c96ce79cbf9be30b66e4e6efd35863d00e2143b2b156000ae4924 +size 938381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 46c33d30ff..d38acd9bf9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b05307a6c4069b85b06ae740aad6712a36b15354218485a564bff55da20ad5c8 -size 1371955 +oid sha256:2c750356e9057245976cf1926f28439e430bc47e52675f5352bc3f1355dce3a3 +size 1535249 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index b7ac66efe4..f6e698f22a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a8045a9777d013f25447136e942c061ae977a8b6cc823ce7a8b04e5b97c6682 -size 1244969 +oid sha256:208ee3496182da8fd9f4c2043f97d62ead1374017a248907dee5dbf391f5617e +size 1278121 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 1db574a096..1b87c33d66 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75a10c01afe19bbda9d887939d296b9955feae1512a5f7d1ed9f1c4872547f35 -size 1218331 +oid sha256:3018531aa36c4109016640b0b03c0a4d0c19b2a16146d45f55b879351453fe85 +size 1306293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6fe4ef3034..f836dec3bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2bcfc1efb30a53c7c0669800720e41bf539c654818ab2771f9ef32a631fc09c -size 1288727 +oid sha256:00c17ea7cce2b342a320e6506a91c3ef3f4c28f842ea2d731a571c1375496b9c +size 1460407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 94e269ed12..4f7b0a3bb8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:371809ceb1aa00ee192f4fb8bb33ad8ce932ccf2adf8a85be56292414c578ed0 -size 1166971 +oid sha256:e2d94daa8c1d3747c79cc1143d57c1dd11ce9f531276f2cde422760d910d9c60 +size 1209941 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index eef1839d52..6f3d3f9e8a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3eccb4b9b813036138a1d443ff3baf3150d5b12752e82750ca2257a56be7bc4 -size 1355571 +oid sha256:d4197d10c1b061eb7a377810f17c60ee759bfacc480d3fffd9de7f78c0fb54b9 +size 1522169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 06a29297bd..30b01c9965 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7aefe33d83ca0795d30a08cc616f16c173ac125ff88bb95482ad92565ef36b69 -size 1228585 +oid sha256:59279f6aaa77aaa9307721c9665aca1fe199fb1e24c6b2cdfcb3406ea24b5cfe +size 1265831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 7448056e13..64a6e7e156 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6840d2ec1d9fde5fac30af8f3dc55daa20bec57724f350792663b592b045a93a -size 1105253 +oid sha256:995ee02297ac01f1fb4e2af377677ca0f2517724a180a7fa42abffbd6b9112f2 +size 1193165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b8376101bf..7a17bdc2bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:237e8d381e649b8c0ceb3ba758bfd442ef8bb4ac4607d22e7f5c8f4fee60fba0 -size 1285417 +oid sha256:0151ef7b0b93e5920a8a642ab3704e88c3acec05a433db2ef69f1852003531e8 +size 1458971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index eb7de35727..a687eee2cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33e86b36369b303225bb59c41bc0c812ef86d20285c0b555fa24d1202feb91e2 -size 1163759 +oid sha256:a5cf429bac6c45e7abb0c5623bcd92dcaac58fa04e724f091510798925d301dc +size 1209343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 71028dda61..400167475c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3affa41c59d6551938f0ac022a92330cf3d094d87fab7b5cc647ced704221d18 -size 1573265 +oid sha256:401e33a725b6206c51482104ef3450c66cfcdcebd3fbff4d8c18235843ad237b +size 1456985 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 26c3ab1ae0..a143c9fd08 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b27fabb10a3f4604ac91710f25ee04eeb69683af2f3d67aa2b991e4ff312478 -size 1251801 +oid sha256:c971b9e91437b506acf77f0afaa96fdf7931d7c7da514c032b3cd034053c3da9 +size 1419829 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 03d9575d95..297792fc3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:847eebd8ce1978a17554bfd69ff9a6f7d11c2af9e3960cef194a1df4ba5a8bc2 -size 1412485 +oid sha256:c75c4ac0dd45a53be36c655b6fda49b02c0a9b357170d82259665d90b19e9ebd +size 1266705 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index d93a280c9d..2d157ca948 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cbd4d45fd02efd0e71896cd9a91848492c2df42a517b0b0cbea723ffb504a9c -size 1122939 +oid sha256:e57ffe44a98fd47583ef232fd4b5ecd5b73cb0bf22455eb029adfedb9423421c +size 1161075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index de0c1cb6e6..115eeb2d2d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e356ba3ef6fcf1eb4aa4d3083802e6700fc39167b2a61605288c76690bbdeb0 -size 1195321 +oid sha256:4d2e1ce75ec8082c2dabb37ed49b0522199a5785268013536cbe2cf0dbf0aa88 +size 1129955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 4536558749..5e0bcb3c40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:765d690a751ecb65624419b231a4f93be7511a99d927fff2a690f09a4c5e5402 -size 1011547 +oid sha256:d26ebcc5a342e8aa53cf7d29409b790b2f376fcf8868fda25fc2ea3b78471fa8 +size 1094871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 84df9f368d..4746bfcd6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6deb629b1daaa351b7fec5806350bd4567d630b7f524498bfba57aa96326a43d -size 1512287 +oid sha256:a350f872c36dc34087ccaa0d97aed96ba5beba162f5214e94e160948267e5618 +size 1385795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1304ff2b2c..861bc93a07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e0e111ffcdff53c1e25487484bb84c8feb367611bc1b3d75d43dd11c20e0c0e -size 1187567 +oid sha256:909cfbde690f7ca9243826ab1e6fc3e1775422fd2c633bdaa9f4903fb0eecccb +size 1355743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 4bd57402d9..571ce4b654 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b11941c3b7494a44bb93bfcc664a35e5c75513501c7f9aee282e131126fa551a -size 1345093 +oid sha256:094ee04dc1b2fd5c3387b006012fd0e158a61d64d92546af75fa9bd836a8d512 +size 1202225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 065d9662cd..e8eec98fa1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a34ffd0af1fe2f31204a02c9d64fd28a5f71004d119cda9c8845d1871bbf28f -size 1065217 +oid sha256:2ad690957072498dce86eb3a3d1b681f56f83573944425eeeaec2da327fcbbaa +size 1102909 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9c1b7c88a2..ccf9e9482b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1beed3e43c3a9441bf2175f23f75ca09c6cdfaf853cbab4bc528a4a3474fc845 -size 1310781 +oid sha256:daf25393291396bcbf3e0ee93e161c305f6a4b78dc8b32549bb0114e2a694a59 +size 1398249 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 663d10fea5..e3ebbf5c07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef0ed05179e2270678c7ec5cfee42e793bee0badcb48e77a430b47413985ee00 -size 1213247 +oid sha256:c7c85d97810936a86a075a7410973fa5c2719ac8c6a9f8bfec722a37e044eed8 +size 1225483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 2e34eb9bca..0758974a62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9eb93040d371eb9c477b4b0148ae9baa6c6b900f2ab41aac9655aeccba02b2a4 -size 1157207 +oid sha256:8e338769fa68815b8845405a941f44f001c9452c0fe0a4a9df113a330385e56e +size 1165101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c8266fe279..2ea5b78abb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54335610e7fa64225f339cb5399384bc62ba1525ae7ec120d9694b6e63b65e2b -size 1227603 +oid sha256:09f8a669cd3da1718e5b0b7f2cc828ee1835d888158204b94223861b4eca4b22 +size 1323359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index f24502d60a..03eeb04ea3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:598c4a45158babf1345a8574c09c43dc10b87fa3d915a954ff038709af281efa -size 1135249 +oid sha256:0635a3909c8a35ced2648dedcdd232115c0ac04545a31fea77a07feff14d9635 +size 1158091 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5c202b6602..18de1e1dce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8024899103035ff6149c27156c63075ba79dfca688dfbf17f12ec5b42a6a053 -size 1294397 +oid sha256:706d75bc7433ced835c5893a50fd4d8d4f1a4213a951f36e7047f08488245a9a +size 1385171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index f78e57b574..95502472a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26110843720cc1f8fe43bf9b1861dd287968c5e6dc329344beb97f3ce78a6598 -size 1196863 +oid sha256:e200501d462b386fe08bc6f3c89bbe241e541270853527c2c999f5828a1d6e58 +size 1213981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 1312235e99..a950844076 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96396915ddd68593f686de0dfec6964d28fb4fbeb6db549aaff081d336865451 -size 1044129 +oid sha256:8d4cc2d2491ad50d907100952c46eb1e5ce7a4d3c4757e63e0ae0c3807fc5d9f +size 1052763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1f5ef18a00..9ff3f2c86c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1e7abf83b8d5e1cae9018f5062b066046276c02ede3f6d893a41199c4305115 -size 1224293 +oid sha256:2faf87ea065a17dce8b25c01414947c4df8ebf779887304aa02a8f948b4cde2a +size 1321183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 318f52d7c5..594317735d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9c6301ba6e3fbe25260331745cb17bb587dce28d5edaddff30e5654ac01931c -size 1132037 +oid sha256:5aade4afc4a15413a0fd10153264e5f657ef89c68fa7dc9e14c80498202400a2 +size 1156703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1e1540c940..fa16ef8d9e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ecfbb854ede9434cb28d6fd466f26ddb08078dd46d81421eed4d0911ad4892f1 -size 1512141 +oid sha256:b173eb657b1b3f83be6e4b29d6cb181c28f1d23029c1033ea838ef748853a040 +size 1334097 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ddd764a798..60cdbc08e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:660d9b4308d43d6f674e59dca5f05c00829c36ab110a8b63be0ef4287961a0d1 -size 1190479 +oid sha256:9e487fb05b18c7b061cbc8888286e9d01a42a8283b7f853925a26be52827b1ce +size 1282979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 802619623c..dc8656239a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53515255769ce2cbf7d6f25d9b97e1c4659ab2b8d969befb82f55e41bebdbf01 -size 1377655 +oid sha256:13283680e8bfe9d9829e28c42a81eae1960a7db2daf27f8a398bf591ce02893d +size 1218753 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index ad0337367e..771a4c45e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab4aa923f9907006cf1daa63179f55c933671c31f4323b5070d6ddaaaf2c6daf -size 1092007 +oid sha256:c383df1d4577db8567f684b18f981dfc8318c3e69d965b9aadf3b04c053a0ccc +size 1108287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index f0f8aaee12..be4436e6da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3778a6feec38cc01c7c37c2fe06f2aeab42ce7f14ff6eb1a931717841fe50ce1 -size 1134987 +oid sha256:10e0448f3b3f7f723534f8deabc227a5f7b1b28694a70dd729ed651b5fe60be6 +size 1009681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 92336f8f21..8090977953 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89236ae4f57e885ed40604f26ee52355ac9abdb275eacdf6967196bb61494994 -size 949585 +oid sha256:143ccdb5562b30247046fe67a0ac2e1430e3a4240d491cebe106ffcfad9b08da +size 957281 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index aefa52a6c3..8537e38640 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb865a86a6c1d836812d4dde432239fae111c3cf0ccf226e0fef071f76a541bd -size 1451163 +oid sha256:f361ad494ba928c38c8f3cd8c26426ab07d51422190b2cbcb2e5aaa84b22c1d2 +size 1262955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 480f6afae1..171968455e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09e66a22f8745582a2891006d966655ac081ae95ff0061ba794c7cef02fb35cb -size 1126195 +oid sha256:29ca8b0a0004f7f021a43468363bcb1a8930f51ad63447df30f22f253dfa2d49 +size 1218103 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index e315fbf4cb..bd2ee44665 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50fb2b285644574dc812f5cd15f0d336e4feb1e2497ff8408a912ace569ca690 -size 1310067 +oid sha256:08e7ca65240508c855ad1a24e160bb6ae56f9e67db8adf4f044554fd01af5ffc +size 1153483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 905433e5c2..221135e598 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:261549d09bfdd883822183f942ebfd39dc7504ab868f4f6fea1c490078fe5a44 -size 1034237 +oid sha256:62c87d276c1502105977ea256072a658b66f375d52878064b3489fcb403f38a9 +size 1050073 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..61c49133b9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b823fd97cebda204b281ca34847f6303be80699ed1a7bed94211d4428e2d342 +size 952351 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7083375239 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51d23c3d8f774cacdc1d16b965021c5ec90674d44ffab2c36c78b8fdcae711bb +size 836507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6fb196f1c7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7decdfe5e8c56a6905ccc57d5c539f5a79756871175ae6795ef7a698dfb8c32a +size 952595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..11ed47f9b1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18a4930c7462629f2b741821ce851b92eafc0385ab3303dd149e95c7ad9b6eb0 +size 843609 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e74e6622f8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e376fece91562a4919924e22f3c0901846d2badf9181ddbf04b34d84f3a758df +size 1078823 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..47a98abb8c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec3ecd136dbcc4e3b4dd2e6561f9a3144374927b6b79b94c2dbe9eb7f95be971 +size 971959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4210397c70 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63447fc10fe67e506d7263bf18be0a3b5a7059eca181a683d445ca5f7d29000a +size 957335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a08fb5cb11 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:069c6dfe0c312f1be5aeecc09d9512b45b4b79affd5919550793dc04a8c1a038 +size 914013 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4ea771ece6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cf91ae781b506bf47c12f6ace5114827f6f03773c7168380ef615113e2c4c45 +size 948257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..52c67f4fcc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad16e5a9bb4c06d3bdbc7c42c69dbb2fd2ddbcb7d1f3576da40f9517a80c4b9a +size 903997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 3d3ec0c2b1..4cd438e968 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3ccba4fec2368a10c042ec1639a78494182534f9473175c85e40c31ee00d23a -size 2005503 +oid sha256:75adb0b1e5abf73dfa6518a5a3b0283c87ba22318929379a196d452d586146ba +size 1120365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 26d6555854..8e90fbb33c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b24d60c26a27220f82ee73aa61d9a901209df84d0cda74c34bba7e4bb2b054bd -size 1981817 +oid sha256:a0ee54fe3059b2e2f7e293d1e57d7597576d27effe10ba8854cacbd65945e04c +size 1092435 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 37f16aac82..be0aac2462 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33c17dafa4ee453e42767aa2d887007b90ce3ada7c264ea542b45b2101fdf6c2 -size 1270909 +oid sha256:fb2d6c761cbfd161d078d8a180ef768d0492289a3ac494d33f78f4879b1478e6 +size 966621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8264b6d9be..eaef03497a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbed5712a2e061aa7996475109bea660e467cdfbf14ee12772cc6eb107682747 -size 1150183 +oid sha256:99f85c1185e9919c89f6f4074491bb9c236148d9576a62820933300ba616eab0 +size 850433 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 4b11ceb0cd..c60e3b6bc4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b008a7b6a58b57ba5c7446bf17f97d7b84a803620a98c7f6f838b04c93c2143 -size 1998349 +oid sha256:2865775bfa4d3b4c7e1876f78865b9f87f4be13939bf87f8d0125b57a31de448 +size 1110251 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 279ba9db0e..a7e5b6dbc1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67d9c77d5c9898e598050e88b2f5fd1f536204bc462c35599c6a2c5378fdc1c6 -size 1973875 +oid sha256:e34d879457ba1eecccb56f452cbab0d57c417eaf5bdeb3c2500a87e69ffe9e7a +size 1082371 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 7a5c38c912..d9cfbcce3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c91dfc240625361d2ae65d862fb83e818826de1ae9aacb200eb9189daf8950e6 -size 1264545 +oid sha256:58033f5df509d879534a6fdcd2125345e2cd94b0b2ab06c1563a16acf378d03a +size 956903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index b55e15d602..5dbc5cd131 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71a9dc841a8cfa53d5aed6861946944d6f7d353e9201674d88be736cd7d95efb -size 1143079 +oid sha256:f0ef2c377b73546e06b34bbd3869906d96dddb6352f06b06a44e00193506d25a +size 840369 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c2c72d5c57..bdc8e20a80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c28763174b29340970f14a17188ee12921b3a4caa6d4af53a7e88e61ad85e5fd -size 788889 +oid sha256:e96112d595e8e2e29ad463f3d355fb4fcf065e8cecfcb316197fd928d5fab70b +size 780701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 25bbb8002b..28682ff151 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:852f45ddb51cb88db7ba510e9725cb340a1e783b5ff40bb6cf03ab532ab3edcb -size 709511 +oid sha256:0918d5d84df7c1d630f73ced74b69e8ac73334992caace651624b8e395916220 +size 707241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2b7d24c751..e8f31e2972 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20ac10c0b521a6c445f84ccab2b1d4f008d5aa3918dc011ec12103288b7595b3 -size 788883 +oid sha256:5fbe3aa9aaa109baa24eaa56b6d7f2c3a30b44246a10a1e5b19a8007273bf67a +size 780103 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d9c014b752..af55ef8a6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf99da4f48ca28d9a4a732ded38c9a779a973e4f7ab95e0b27feb7ec2bccbacf -size 733185 +oid sha256:d4dbffc5311a4bd79bd529f0c8e590039ba3d776bd9b91fbcd8360b753e01455 +size 728153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3d4589de87 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8984dbe5a22e8539199d843abbbb51d0ace0811d997a15b1a8187b1626a755e3 +size 1026597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..475698c9aa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07b6b4c0e140c540e295c77b8beec79083522ca97359a497dd9d1c8f168ebc2a +size 914503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b59c8c684f..128f445f32 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31521557c291262076e0ef6d9a78ae5bd2ee3508784d202824421ea2c30477b9 -size 839091 +oid sha256:ff5775c2e8eaaf67f59bafd1c9f5d9a0d8ae5ef13fe70a75188b2ec6d1caa5a3 +size 786945 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6c3253e39f..20cce7f037 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77d971008b612b4d185d75ce1db4cf8c6702c6ab4625683071c8e25a25b8089b -size 667601 +oid sha256:33e0242db9aa29fe2c440518ee7d3841e7a282c20bf2a10d10f4d7b5e24c3fd2 +size 664591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e16e1ed23b..c4a68d6ac2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17fed632cf6481a703830758f23f4c77c050eb09f33fe634978425a61bf3e4e6 -size 785907 +oid sha256:d83839210d908f2087042514c3bc003d677e079194b465547622a0c9690db221 +size 717827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 697c33223f..77c374ff58 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e99c3078892c91c5a6d86444923594d6c33f3870f0a6286cd9b63e458d1c24c1 -size 622359 +oid sha256:b1e4515c4919de30d5071956346340504c45723355b9381b03d6236e9c8edf57 +size 619943 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f6e584e0cc..b6e4c0bcf9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a53570fc8f4a96b5d8caae6ea101eef1f53727b10fbba5519857c1694927a4a4 -size 781737 +oid sha256:d4c710a2be7ef89ca688e0d1b9688c4f97988ca9623919ff7de48fe07069d55f +size 770587 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7a8daa6616..7f79efa656 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b52bf0dbe3a5450b36668e725d1eff0558236127522b659dcfcc03a18922ebe -size 701567 +oid sha256:a1535f9581c5c81177da6d0b9244abce72522badcd70c8dd43ee66582b19e0aa +size 697177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3d0596060e..b9e10b7826 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc3427382202f8b4b19875f2707072162fd0a1ce579c29f9c72bdea9c742d8ad -size 781731 +oid sha256:9fe1bbaa14b851892983c5ce7479e90871a62a9c1d1a25c29087eca85c170409 +size 770827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b27d4a40e3..baa8363cb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:145151bdaa5ed1de788ac5b39824667709d5e104f73240497bbbc275974ad133 -size 726081 +oid sha256:dbdb663e1a27e457d3e944c5cdc01bcb457f59553e23bace8caf979a1e41add1 +size 717299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..fda04eacb3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4310dc5cdcabd9c37cb816c2fa2a70b87283cff78d5b7a0d34e7a973801171f +size 1016829 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..fc1b3882cd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f2ba80d6affc6ff6a17dd9be07d95ee5ae8b18aa1f703699b0ee07a1b8e789c +size 904341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2b95cbf57d..bc0eb43d9e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e11a8ec2bb10abfbe9698cdc34a033f5dcff59467762d23ba507c7b4c450b2ea -size 832727 +oid sha256:ed5fdb0dd035d224f4d2a191f156ffc647513141907d17bfba57c12e5c965a20 +size 777227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c07b2a810a..6a80ba529d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2df2df041142f2b37072c2212126eed050548308aa0e26aa7e108553170cd7e5 -size 660447 +oid sha256:12490840566acd483ec69ddbed308a93ce46dff063743bcd496f895f3dbd1414 +size 654527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e419a33d3b..7d073acba2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ee3f941d8a1aef89bb610f6c30e1cce639d2d79773cd13d2411229560c47123 -size 780185 +oid sha256:45edab51c8a4ee84f52254494b02f9dd0a6c085d8ca334a0066ed44d8ed64923 +size 708011 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2dc77dca0c..a4872b732e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb72a50f4c60c9e3d7428880b5a88521f5697e9430b6b4a9f8bfdda8504013e6 -size 615206 +oid sha256:88bc230dd78da4a7a35eba701f1a9e2f35462a6c2ba5ed3be237fffb913545fd +size 609088 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b74fc66a11..c55d856172 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ed62c32bad842b0aaf329ba1f22213a56ce4ef0ee1c88ad8b29fe2c077678bb -size 820793 +oid sha256:7c251ef8bd86159aba5dd2fa404943a4a22da35ca2a3f71dfa2d0ad205885437 +size 790107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d318aa42b5..1f017bbfa3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:904ae01593a414627b9bbe99fd68ac2559b7e1ad3535efb0227f9d850a437352 -size 736381 +oid sha256:dfb1993a6081df35fe980b9b7079f06377ee447e547ce313f8627c6d030ef69e +size 727797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 21832805cd..dfaf95bd08 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29f2161ef7c786749a4e5545f782c17ab546d334f0ecee8ec609c9d9fbe90f13 -size 820787 +oid sha256:bc2dc5c2792933c234e79d5d7f5662cca07ab767fea40a4cdde179f0f5ff6bf5 +size 789559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ed6bc6fdda..0199928253 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2369b709cf0f52b0f105340d13784880f6a1ad1f4f702705dbabca8f64fb43d8 -size 762719 +oid sha256:a64e4cd82653dcc13043c7f9dd00570c398804ef6027967a29dbf312ba692114 +size 733663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..36e2b43f2a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e5344941d7583842e539fb8066d069b10dbf2412c696f2059285da215a4c3af +size 1114345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..fc2122e979 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f2c5d91148b49f99ed86178f587f0aaa6eaf421077cc4b43f46dd284a138502 +size 1044333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f9a0beaccc..f1d3e7aa3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b025b8c5597a95c1dd70f6885b317246acd6dbe0109ec37be053c4f29c123c64 -size 864087 +oid sha256:730f7eaabc5663ead5a45f20995e49b81c36eca16b54ddb9f5785f80eb77c647 +size 765421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 06968f2aec..2b8c44e46c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1562e29bc47a07953384b879d6336dda75d3fcf06f3dfe0f87df51764e0b9a17 -size 686035 +oid sha256:ed2662cab92f903dda3ceadbb570e0bbeb3d3da5087975354b28a8ed98076c92 +size 673555 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7fc66ab0e3..102138db44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8075b0557210a51c4c176bb9de2bffff3e79568fac7d70ad9105fcfee42bb6c5 -size 810509 +oid sha256:e7552bb3362a96d03454ac9c68e2890405cbc547c1138278380cef466315b155 +size 711449 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e37e7f38f0..853afad212 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc1b5ee37ade32a122437a16a073823ed9dc7e689df8a8d13a8d7dec651bb185 -size 637737 +oid sha256:67d0f4f4ae47b0e73912cdd9414ca826d19e3cb703ce5e997cdad158db7791fb +size 626537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b4b06a1601..9af02be8e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c37900e6ae058092897b845bd2617b321a4d52ef22087083fb0c4d85f1314d3 -size 813639 +oid sha256:96dc5891ce8eeb79f6ca0e57102f0d44a4e63cc859d9bb589ef32daf7f3d5733 +size 780043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 545457050d..a85dcdd55f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:798b82ca5991f864d2009114ea44e2e4dff5222837486e86beda2d872dd71a2d -size 729229 +oid sha256:d1c1308a8ff4195474573be1715d002a68176ec51c37ecc82eaeeaaf07abc3aa +size 717685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index aacbeb21d3..dd3b9f5116 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d52826a21278996be0de383fc75cf3e755a70f3e7f7d8dd6169f9e00518e2fd8 -size 813633 +oid sha256:4d073f597f87aacc6c39783ea75c6d418e7ec86a9b154da395913fc4f1a844ca +size 780285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2c71038b63..dc068a313e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88a74c9a66bd5f8729dd3da9a8e72b84ba8732db58bf04b2f1330b42e4533a6d -size 755567 +oid sha256:f4a0227bafac57c2cab8672dedadd0cd96733717d49b9005561176c931655776 +size 723549 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6444d14716 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca857322a331f67e24fad316a22559058d615b1fcc7aae265f9d1da367207ccd +size 1104627 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f4c1db407d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b88e56859ed452424e044c4c85925f61e1eaf0aed920412d37727626252a8270 +size 1034319 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 39fce948b3..074e82e852 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a97c30133e796bdef56fb0cdc5e567b4b13770452c7940150b92a42e767368b0 -size 857723 +oid sha256:cce5bbafc99f6c6896de93d541795ecce9070618319b323f261553afff967e8d +size 756443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b301327ace..764d965bae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:666a46d1e3426aeeb40d4dc9b0cec35a951551b822dc435b3c8c3fd3d48662fa -size 678883 +oid sha256:c7003f650f6ffda032333c93bad501102066fa91e11dfb751f5e92552270860e +size 663491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8ab6225466..f44639ed1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a26608b7ef84fd053cd0ed6126f9445111f1d194faa6727ef7454696e596f8e -size 803949 +oid sha256:ad2d03c0bb4947bbf987b57004e9222ccdaec286a3f19cf0f9125ff159e46db6 +size 701729 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 48e8260cb1..709d34385d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e75ce7c71c6bc6038771b96f54c813d43a62eb9a55d150fc0e0eb98fb977cf08 -size 630583 +oid sha256:641de51a75e3ace076fb325ae1e6a440837c72c094cec3949517f47397831542 +size 615634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1dcd4ef65e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4148faba39795c7ac50169a495d114dbecf7103739d14deb60be4c60ba037249 +size 745051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..150e475edc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f61ca20819024a7b3884b657233613698cc199b4157cd15fbf4a98c57257b08 +size 655799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b8f54f3374 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d90ec7033a107c3cc455fb059ce7d7fae5b2a2cb136d633ebb0d22dd40f6e57 +size 744409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4e8307c597 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5de937fa556f49a01eb65a9630f932fdf1bf2ab61186276579a0dedff782483 +size 660385 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b66ede13d7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01c0e17a6e7e7defc08691a19b690a6cd70b43f374a36084e811125cb2a78958 +size 813213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..83e485af43 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b28ad14ae8305fcbac4efb799d5988ef0da09231508e249845cd6ac5e2a10bdf +size 722481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..409d9cb3b8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b59601a610ccd3d3e794558d92522cb04ffebd1283ed22f62209550f89ba827 +size 781117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..709f4569c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9adaf371aab864e3750653040135247afb7e14eda3916d9994556e3db6a54e45 +size 742629 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..03ef736527 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4716638bb1648fdc697283f50f8b4c2c8995e6c280eb9c6f185cd3ab5622d51f +size 762469 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f9973a37d1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cc274cf5e61711c549193bbf1418a53747ed8600536b218c75511121a5c2bcf +size 721661 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 06bbefc4a3..9c1806d637 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36e4b279de6565ada8226697653a18532014f907d596666f12792e1cefbea085 -size 897817 +oid sha256:22f5d40593b594c68e131db0248bf1f8db44194d8699627cadbfc872d35b6830 +size 911433 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 639eaf447e..2fdc71fcae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34c837f389d8d827614f70a3b5d88d1b79b412dff07724cf40a2ce253f711b7f -size 823175 +oid sha256:84423e7c9870b367107e491225f325be48b03d4d7aa61ad52730b0b0ea03a970 +size 836149 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a48deede80..ee58ae6e01 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ee1017093865ef9348915a89ae054c9a3bd6daa963a50e97e5b3bd0253b94ca -size 892287 +oid sha256:4a9b3006f536edc4f6bb375cbda7b05a023d3b7902fdc7ca2ac75720fb91ac11 +size 906247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index edaf69eab8..1723efaeaa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:135b75c6f8ba2521d11d752e635cc59e6bbcee326c6f9944b1ab8b2f412717df -size 842063 +oid sha256:6f6442632a0f97dc3af9bb563b3942bc4174e79173602f56d35ebb2001f347ce +size 858787 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1c7ac6c739 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d4e5c301a6e826b0221584e4750a4d0e0c32db84b3ce8d4851efb41f9e04ca9 +size 851859 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e1d330e692 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f46d5c02038366d1ad29e8397e80ed2f28598f46f5d6780593caca5279dd3cb4 +size 748053 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a9680ac02d..ab561a5c13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:357cf5b7a59773b872a85efc05883abba8c4fe49f95d8bd7f56eca2f73cce244 -size 985365 +oid sha256:bc74a80bd48a68548adf4f572c87bf1b7b6e835397b3e546e2183fac61b3f648 +size 915755 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 959449de39..bcf4e529d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06b663ecb82ec54a8f130a99c1df41075f646aa916fc6a189d76ca5c175dc281 -size 774013 +oid sha256:9bd9848a2d5903c3ae46e818f99e0afb3d2427a368092a8223817e1122a8bf65 +size 799961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4a0f5c7d28..3753cdcce6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:269ed6e3e190b5b765cfb28161d5f5d4afb249337dacd62a98c6eda6ff5b6535 -size 922413 +oid sha256:cc289a900a18e11f04307d235c456df290bac42da55f99b3ab5b96e3fe3a9007 +size 834057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1c15f2ba66..93f8eff8e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edc3c59c45e4b1a58efe01040275556bbc7805b97073b9043a43be7f6fb998f4 -size 725615 +oid sha256:e5aa10d86bb1105848f0dbbeea58bd1a0ab41dcc1f4888c69fc8d316d2d889a1 +size 737405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 408c9246ae..20c9c1ea17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fbbc87357479397d92a1897751fa4a76f6c8ec1d156f2c31657f1445af859a7a -size 885139 +oid sha256:261e2a2f6da94eeec63d23a7df0e14854490ed17c3bb11691a1a536f3ec71e22 +size 889727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3eca2faadd..fb0c56e517 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51fe852b7605890f0333b866056015f5f716ef476cc9efc1bb97920b334f62c7 -size 810447 +oid sha256:e5b9468340e6d5843a4af1c56ad2ffc1bc28d39d8f0b4ad7525c6a13fda1f12e +size 815183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8d2cf0bf5d..f3478b01ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7287b02acd784a8439387735ebd9e7b2854bc1cbdfe02fe39c4c2126563eac8f -size 879559 +oid sha256:417ce7b6c2993aa6385a275ae83b9fd48392f76a44b92ddf5545c4384cd2b789 +size 884491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 563f9d7341..75def8815f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5276132be5a47900c7ca160851dae68384593b5acb413849d2cdf64190761838 -size 829385 +oid sha256:a7090f06dab608449b5ec27ec8b5b89fdefc53f74a25e73fb1ff4f5d3e28492d +size 837031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4e4adec999 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e965ce645b51c740d3d5178a028a7d8be5b93583ada784d028197fd3b0d05e +size 833211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..9c9027248c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:179185715ab6aee3bff796c1f693c0b4c5c2ffa5a7f26f146754f32c90151a0f +size 727087 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2a9c413a3e..113250c0bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69b2b140fa759c459c104f3c5dbc0434a7754bc5d69516bcc66a25d1d1adb347 -size 973277 +oid sha256:92512b375a377ad4eb61db8fbec1ff6dff7efb555b4f4e29774cb4c64304a689 +size 897057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 107676c069..3e65106982 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e1cffa7b2a2458289b5aeb06b68eb13112b711b06db348fada30e83a8972b10 -size 762913 +oid sha256:5b59cd4ff5721d525fd160f9b924d3a81ca9d985e48249d646672e750cb77927 +size 778205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0f0c9480d6..af1e4cd89b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2886de9c7610887626e27a7d02a046e5fc986820d48c993d672fcd2bb9ddb1c1 -size 910623 +oid sha256:326303a8408170e6b21337e501195e158ef751fd51f601bf9a74767037e01a8b +size 815311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8e4edcbd45..04748bc335 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eed1562525ccc332535b364ad27d30f99643910646d8be0aa368c0d9612f8345 -size 712097 +oid sha256:5249a86ffb3f7ca8ecd6e5d6d50d1b69e87c49bf512ddff5fedde295927d2172 +size 715699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9142cfbc19..094d186f21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fd63e74d385fa78046c7cbf17539af4dda62a13863d27b8ad5ce6587132edce -size 931349 +oid sha256:59c05fe426b70c4a87b1a4575d934285e195dc8d731c1ce0632aca7f99f54be0 +size 920101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 257da309c0..b8bb05bb95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92801f675f13762e1cd3699ddf5fe8e8b5b98f478af6e3334e5caef60bb8d41d -size 851033 +oid sha256:3e775d3e1ce9e0edbea863cd6fdd8b339d38273ab23642b49c0892384a77661c +size 863415 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 77ad421609..7ab196f52e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2366d4c17ddd631d41a8fbc993579a4503b612c53b3e242b37da71355f4fe9e7 -size 925769 +oid sha256:4443dc89583a00fc7e1deb4778efbb7965ae1020d4a516df488044d2ea670e47 +size 915655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3ba7990b8e..d88eabd271 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5dd7fde47673256073785f70205568ae845e2ef6fb601404fef37a69df23120 -size 874755 +oid sha256:e6b4a0c3fcdac22a4cd6e54f8e41a0850eaedcbfab6ff7b1c3382805987f3860 +size 865827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..ac22352556 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:103889ced765ad717abab29b84987a0952bfef37e0d1ec89aa3de02b96fb414f +size 866149 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..6e1102b8e5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:212b9ad5350ddd2c1f7083bb71268b0bd88a3f77b2653a23f40967be86f96129 +size 810741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5e4720846a..df67374887 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40448b1dec359e46051244b6ea78fa9ec50b65490c972ccff33d9e2d0c69c449 -size 1007895 +oid sha256:9ab8bc3f4cfd98be022fbef6547e7fd470cb145c10245698d63d194eb92f69dd +size 895019 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5bdbcf7da3..622fbe7097 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7e4db1add7d60a0318ff2c09af8d75a51223fd8bff15fe4f93bb82926c98f97 -size 787859 +oid sha256:7453fc2bd923a90f7c7772c03eabfaf57b30dca102374a96e2dc13f58e129d3d +size 796295 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8229923d36..99cc8f5159 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10a1552728953cb1aa922914370b76dd8dfda8a7a7d8e8101e3c2f1d8cf664ee -size 946225 +oid sha256:a23bbe37c6a425b7e33ddfc7433ff041d61f6244cfe90f209883747e96c871da +size 835127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dbc71dc386..539b864393 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f90c60a407d584bd8b7262e83ea6a9082f7b7689af58038089240b4f7921c39a -size 733147 +oid sha256:1e3ceda7bc3d2851ef032363b00e2d944e9f3eead0ff5b566441b04fc446781c +size 746023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 16f187dd01..0315920e8e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d47901469056a1b1dd05eb72ea19f593f95bb68cccd470ecb5915a1dedcc7f9 -size 918621 +oid sha256:2001b8016377d89b28657e7c90633a6ef2e4a5261ca27b044fc4565ce0e9e87d +size 898395 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4e53ff7fdb..1b54d36c55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94a6293ebd3460461e1f7a6db2dd41fca7ce53bb5aab8d8f4f090c3530f61377 -size 838353 +oid sha256:eb2785a1e734fdc7d5573e1ed172fdfb6bf5e388260afa380f8a04c466397c23 +size 840869 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3b68b755c1..28c706d66d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8e893851d86de7caf282bbc2fcb8188ba6dde29c189ba17c619a39389f26b02 -size 912301 +oid sha256:3810f522de34e286545c8b6aeca157e94e4bce8667708cf54533ee0e841337ef +size 893159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e7dc42a05d..967db74e4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65bdcf3a0bd1f012d2028d1f15addaea494d36fd7811933cb80f9e1a6313173e -size 861287 +oid sha256:18ec09e2175f95ed51e186243e29dfd8b2318751630800592ffe0578b816a805 +size 843331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..91d2e431b0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bd761c99c3b7ac70f50e3d409b87f28a09e180124e08f979dbf25f7a06a0ebf +size 846713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1086601b50 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cbe118f140d046f6fdc94a3e89777425c021f6a6a6fb6c8f397554df9f883ef +size 789773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c74ddcc217..28e6b7ed51 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64b56b82bec459f37268aaed46c1c055ae8687d5977887b1bfaffef64fdf0788 -size 995759 +oid sha256:e78c26bb3d8f82ebc92df4ff9a3a5764c2ec7bd963083ebbe8c8d04054178abc +size 876323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 69fb5165f4..d45335ed73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9d2ac0106e961769963dbcbc6430eaaca0857d71f21ad41ba23c54950da021c -size 775181 +oid sha256:2ae4ec272616e7cd4a4aa27c67c70be9a859e9f85b7b2547d1ffc16f03311378 +size 773751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0defc2cc51..fb1ccca3b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8990df766fa4a37e08a999b0674821e1d6691d9192bf9b886d5f7cd8fd4298e -size 934435 +oid sha256:5f21d0a68cebae86f74c89d67dfd8800eda919547a719ef88bfdc231979a3ad2 +size 816479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 533635316a..cb01d85bc6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98ab7c0613c6507ffc88ffd3a31c6b1c00db3bad887c34a1635a7dbc86ca6f7a -size 720469 +oid sha256:0b9dcde0ca98fabaaf13086019f950760dfcebb464f7433879706aa71ab69302 +size 724317 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..db82026599 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:448d92d765f69417e5766f1f45453837cde3f675136c8cabadc08e1d0ed27e41 +size 883085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..99fdc1be82 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0675d15c6d1f1c67ca01cedc0f4a20872df04d0d7e3a02bf2424587db3ae6a01 +size 768327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..46239b7bf8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a981e498a5a1caf0f56d920e41329e01f563891c6062690c585ec6981e3bc01 +size 882539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..643e29dc4e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eac135a2709145c98e679783dfc874e83d4e5749e7eee84496b1479620a3746 +size 775823 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..44433e53c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5febd77f78283156c0993c60ab2aa336802a96b530a71ac21eaf5ccf3c293a16 +size 1013801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b716b22620 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b19c4ba8d0181be87f1185e303b3d24bbe0da68ea98b649748afe643dba40d +size 902989 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..da13eba254 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc5f7a798ec55fdda9a254e52f7102295c7d1e35a1c4962992c39dc8d051b651 +size 887231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4eab188444 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afdce450e4d464e04ecddfb8f14ceb7cf3f23e6cb0434a80e5ca9a331bca35ef +size 844943 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4afb25a44b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f10dd20717378dc4ce3be9d9e3caba57ca3d0d32f79447bc908d6f272392c8e +size 877413 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c765b357fd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f6e8105ef0624650cfd18151dbf37f263231fc7bc3de38159374d98a30e3e3 +size 834929 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index c7da4c1be6..ca1d8ac4ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:074b63ae8e34f64e343506f46191a1fdd7c8475d3f705c6c2bb2f52b44c6cdbf -size 1885719 +oid sha256:df82fd1394058e8e26875389fa260bcca2040ec1fc1a8141446585e456ce1cdc +size 1047743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index c9a1a2f48f..629f42ead8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51f522ff0295ed3ca4fc72cd26c704833f4f50570ddbf36d7ba433ef65328b56 -size 1923601 +oid sha256:f7bba9afebc0b81d1bcd802b2e94d7f2dbebe58bb9197d88fc04eee4c65ab5fd +size 1037477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 6a879b94a4..f99fdbe1de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa8723c87721795ace99ed886e07f81705f059e2b5f96094b63d19d1c888c934 -size 1211313 +oid sha256:1853ce023f214b52a7f6e9f62d79759296ba0b7fd984f8ef914904cfdb1cc9c9 +size 910775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8e5bf3838d..3d0de1ca4c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9fc67e26aa9759c4a9589e7e6fed9d1a6dd033d69531d3e5c3ac36dd7df9754 -size 1093497 +oid sha256:d742f14824e347b925934e6355bd59c5a91f4475bc4fdf07055f04c1d397c8b8 +size 799125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9e0c11ac17..f834903a45 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cde581eebe2c8619c69282c813da5ab85174eefdce7f1d6b4fccee2c79b8e3e8 -size 1878567 +oid sha256:4a6853e6cb21b4b18e861a62e02b2da3c89669b464f0c4e944db30fc173e9ffd +size 1036891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index c8b549498e..12b628b109 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7175f3d875ad8cfd9ff0783ad9757c1131991383f47607ffc9697cd556dcf65a -size 1916449 +oid sha256:69655be563b4caa4bcdd8701abba7324e58651e6c5ac6fb38c439a661bfea4df +size 1027413 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index fad485645b..c069f25adb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd11d46fd080b2ea6b7194ba28303acad6c549557230cb1a907340a04c0b2c26 -size 1204851 +oid sha256:29f8ff1f79e2aa652201eec9bd562ea1f7bc6b33eaa4f7c8ab7dfa9949ea10c6 +size 901007 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 1505552108..313f2e58c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:840560aff5736045dd2ef72f1a3edf62cbd0839ac4bf237a59a0f1512e7cbac5 -size 1086393 +oid sha256:f8a1a2f882097c910dbf91aecf72656c7a8281c3696fd0e9f8c4f6f802971ee3 +size 788271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2a006c44e0..ab559e9d79 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de7cc9f5aa8a98f03d47f4bbac99637d6d9e5655f19a8ce199db501f43108e8a -size 738667 +oid sha256:3337b8f6a80d0812c4761c7c693fba24c5bb726233d5b17c0ab91f3a594a90bc +size 741429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f6a4feed0f..8a1e85c59b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f7e832e9546d1730caaedc354c4910ea24f01c48e9f207390e2f20b2ea4e15e -size 702009 +oid sha256:053ed566c8d82451a74b830610e563183a24628b8c0542bf20db7f3d687b449d +size 706647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0c2e260a5a..ab7623b57b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b8f5db082650a0b6ec0fe3f66b9743b0193b364ce7f7ed24a5894cf4eb101b6 -size 759381 +oid sha256:d5c5d4d8ed1cc5c5654b80ed2c677ff291f6108b46b41e488d2a784471494899 +size 759873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 99002926bb..490e473e2c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e54d6377ad4375e5bd2c32746676af757eac58f9db399080a6266863e45be96 -size 723415 +oid sha256:eb2a6e19c2354048bc4a1b494ea5e499ed55c86ead8880f0da1b3a7094f1154f +size 725979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..0065a9ef13 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a42c6b155c1f98f8fb55a78d25840826d5d96df3725977fcf6fe59c4aa1188a +size 955703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4dd10586a9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bab716440361fa6f3acec4c22ebadf2515c453f099b764775a975c90ab194038 +size 846421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b55ad8a0e3..64770a524c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:709cbe12bc688bfb80d89130fe3233a5a71bdff46aaee4da4a68fae90119165f -size 818271 +oid sha256:b3bd356cfd31bf8b86a5a5c60835872180e719eec19f9c1f4eb1fd167318b43c +size 757541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index cb8d297259..a77db5b8e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe8a08e39129a8c26ce9036af65c29d32af2f9bb5fc16e6de74825ee7fe70e84 -size 651615 +oid sha256:d369ca602a7a2bfc423a0afeddf36a3d801861bb2ec709c5015f6a5f328b6bfb +size 654723 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3402733b51..f20087856b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea175b36831afe39752c759764e964cbdcb436fab5fc642eea6fc72cadf1a8df -size 767849 +oid sha256:25ad3f676f623f1586ba12571d98250081a3eac166eb72ecb7f736a8b9a996f4 +size 717925 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index aba329f234..4d8506b9f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc6ada1a702f428adabd964dda38cf042b261317f7b524ec168ad42eb1ea2f69 -size 614068 +oid sha256:176242ab09c93b8ce56988fe5a39382afdb9da2c650a9ab068cf398e2a2ef789 +size 617769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d870429c5d..0302dce3af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:abf75860ee5ff50b6edda29e3ae1cbee35c771327ae3218f3c5dc72f30447465 -size 730723 +oid sha256:2eaba10c7d0ae6fddd0512c464a58db51c3d9a1d07fd2137bf98ce80580e30c7 +size 730575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f8d31e75be..d5619344a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01fd9b8e56ccd81ddf029540a2ab5a8fc84df0b0f0d9b66686226e0a4d951495 -size 694067 +oid sha256:6c704edf549421809855575c29cf44cf082203d4d1534f647a07cd45c6b69563 +size 695793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index dcdbe014d0..6f02fa111b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25066e389f7e760d570d7cc7140fe601b8d3b2c820203527d3cdab6b37011c04 -size 752227 +oid sha256:370952eb5c129c11557c1840d770a0705b9eff11c033efeb8c887a702ace1d73 +size 749809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f9dd9ecf2d..7e7c4a3724 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7580ce5378525bd2fed9c68d6d802ee37aa298dfc6ab17b6796388e891f868e4 -size 716261 +oid sha256:a6ffdc6d7d61d3425cdae8955b736a2c8dafabbf3c4375471c9acb2dd8787ba3 +size 715127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..bb9b69a24a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab9c934cd282f9cf87a87252e93baaff3f9ef0eae32f69fba817fca06d476537 +size 945985 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..8c33774440 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efd21c6f478775af366fca10e101949f566e29b15dedc6467126f3b05d81dfee +size 836259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8081ac78f4..f6ecf090e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e8b141959eef91934d5d6131b7e6137a20d0b21db1f227895210ff3db68bb29 -size 811955 +oid sha256:8813aac386d16a3012f7c8ee82d88052fce9fef898c22e4e4139652ae8a3313d +size 747723 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 24e2d86a89..95660f7abf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f52db3c70a6eaa13306fc59e897e5b98bb7676f9d48efcbfe040c81e04f9b870 -size 644461 +oid sha256:91b0ad89f4710c812b6477ed1bbc34a088f9dc8e0414afa653d66c4f1ffdb666 +size 644659 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4a67d2c84d..db12873c6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ae90b3de05b2c950e7ff246c19d8762d99015fef81b8737578ab424a3de835c -size 762127 +oid sha256:8aa4f4b477954523ee478c390ce1160ebf7ee6bba66c4da287dfbd4c07f69f96 +size 708157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cf6c441e2f..6274f209a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7eee8e9a1ce869bc66f092d268ce7b7f3201593becbc89e46dded36fa794d5a3 -size 606964 +oid sha256:1f7bb338118243c49403f9948d3c510c20d53d50f7f409e726c019f7ce36191a +size 607704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1dd1e03e77..bdc2eaff59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb053510ab7dfeddd2df93a40b2c7688273dadc4caa13f6b500bab9d85ac91a0 -size 767117 +oid sha256:747e783299f4fc284bae59405ccc7b6c6839ebfe9be7b374d1d6f55ca49c4bcf +size 764353 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8520110acc..11c9d01a83 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a30b8cc5fce120e05a37c6e86dd6b92426d2c6675bb869e1dddc2675d951ab2 -size 728091 +oid sha256:526c2aefa8773bccd5e833fdd097776e73129e4bcc6c5f0f25bdeb3389231d76 +size 726415 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 86707f7152..1640c25009 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3ff0a4a5aa30817748d0b54ed22a204a538d515495c68d082242071e6fb12f0 -size 791283 +oid sha256:b22209a6690a13b106395df863bf43e69e8d5319d1eac27961274e7468664fef +size 769331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 232f4391c5..1d236b6be4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7db695b2113f759a523969b497e044b9977dd1cfb51bebe4246a3f844878822 -size 752949 +oid sha256:319a686374d7bf8ec0074553890ab29dacb53e1e93119702716570974e07a8f4 +size 731441 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a23332f61f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f28937dd7620010a402e04c7e6155703f5a19dfaf23444ba5eeaa1eb08c25a88 +size 1049915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b3f151b2c8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c13061392cf06ebe73837214a1ee3e7ee52b9f37e46825ed488b17a0b968d7b5 +size 974475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index dadc0c2359..c0f95ce4fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81d19cc2a62019389bcc79325bbea1d56cfe0ef5ff24bd7c37777aefdfa93131 -size 842477 +oid sha256:2d13ea5873fd24a68d056a8930a940e3328cc7dbf140e38c394f16e06415c66b +size 755207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6c0395c1ed..cab65ead24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6814238d5898d7f44c3087246487049a3c5b462a0418347ad5d58b8cdd1bce2 -size 670149 +oid sha256:d4246fea0b83a1d37cc479c36c092b1d7a6599f1bf6fcf90fb4008e047239887 +size 663883 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2587eecdb2..65e415a824 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36cf95ca347cbc28fcd208a5b5f7b584a06d76b5a6f4321f77e194697c76a21a -size 791021 +oid sha256:3c62e8a7aa6dea890b0ec060a41fbb60bfa303f0115df3d6b2fdb1b3c7a99ab8 +size 708585 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 256406d2be..7175e2625a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5cf24d0d994ab98c554af5ccfa4dabe50abc04caf31509064a11e97a3757115 -size 629447 +oid sha256:41d39ef48d3d0d70ad36cd8ac00164c51f42120ef82f50acc252d68908ce7c4f +size 625155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bbfc892b49..1cca59d03d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43b9e452bd0fec211945a68d807004ef3d58b5b0b555b1d70543f980a5369550 -size 759963 +oid sha256:13200791fce686b07ce2ec010abc3ca39fd0f521cba45ffd874be711227ec3fe +size 753501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a92b37e589..12b1e2004c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79048fcdfc539d49695b89debecf1b9aabfb3caff856394c6f40af5c76d62937 -size 720939 +oid sha256:3a75c2b5987965042b44aa7f354cb4fb29c14837e500d46a61e693ffb6b56497 +size 716301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ebaf8864ae..ab0696640d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d99c62581f0dfc877b9aa984db40224f1d2a63c05da76f3def1275ce6d5aa3e -size 784131 +oid sha256:d6d53f31d96f9680d0737c03e45ca371330fa5c474762f81b57c559dbe58629b +size 758477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 34237f3928..422fdf2c36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e179872ce2bf25ef813b796f28b7c28c8438dd2ab7c625f709d2b796c31599fb -size 745797 +oid sha256:e07dd711b08a5bf4fc737070f2ef876e99aeb0d1514708b316d3f5ee0fe87d75 +size 721377 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..024ae92e04 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6d090c9f68d55e88c7950d8299358952fe6a5115c314cb7608e766ec8684c74 +size 1040195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2345370071 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fde6f84532d4fd3ba77c6fc441a4dd26d3dc942509ad1a1f68db41073545d823 +size 964411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7f29040004..4b8c7f165d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f756534f29bafde2ce4dc84ca272c7ab5e14c554b63648b1cbaadb40b870381 -size 836953 +oid sha256:24492d987c459fc531f4375fabaa749608419cafcb1cb02a869eefb4efa8a6f3 +size 745489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c3cd15c626..9721b58442 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9bb7b9d50ed995d04cc9d6efcf38748a2bc76448cf358605bc28a365dedc7d15 -size 663045 +oid sha256:c4bee62432993ab84c7c2e64f2f495e76b63a42154e62ed8bb3f3af7f51710db +size 653819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 56f7dd3cf6..53b4d575ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40405b55a3d599ecf9299dfcfb7b95d3259ce96a487ffd413fb3e8728e72e10b -size 785347 +oid sha256:cec34144166b64e3c773a03bb48d434cb5aa38655480b8aeb310f94a07a4720c +size 698817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index afde18ce89..7373bf5a0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db144bc2e82ee87aec2a62ee6f7bd61606ae5b5ac4e76f0241a95a9b32ee00b2 -size 622293 +oid sha256:6b20bee64e3de7f99ee83dc67c8229cee9afeae42d00c407040057d456ea4e92 +size 615040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..153366df4f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a9b612c3fa9d9f4449f10254b52f3fa874b083f58c4015f5f94dd7d6d52f505 +size 936873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..6d9b4962ad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffe51f665a7bc92925991fb9e731e37af45d6d09f9af98e00cd86a81ea82fe6e +size 830799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f16a4f24ce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc05a2c0d299ae8b603e14224e06dbf70f7108ac579e96658228b3368aa96b08 +size 935539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..8cbca8af1b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4f3423baf2c30ce402e9bdf1069f2c86ec2fea13a526f6c566b9b387ece9449 +size 851615 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..63e3982eac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ace31233e2422b798ae415e246996b172b156a992bb323f5bb44f3a563851dbb +size 980683 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ec2ef2e841 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51480f9f4e540d4c5e2561cca6d779457dcc6eda4e9a641b5d75a5c9c949e57d +size 930947 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..96b3607069 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be953828d338a2f2af97dcaf72221dfea289fad7396e53874fd95a393d47601d +size 967215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..fc3969a2cd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04a66aac8ec8bc3588a36b8c66c04836d0e95652ea0970fb547c7a9910feeb42 +size 916839 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2d14a1a8fb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff59ef6abaed094e3dd08edb142ece9f7b09c31a0585095ece632e9246b5349d +size 1048465 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..eb93a2def7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13033ba13a7d2f349471a5b0862ad677fa5ccd99b8ce7378cbd2e44c3c18c187 +size 944611 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..fa3777b700 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:893a3ce916901ab92d384c0adb0ccaac434b14caa79a9dccca54306dcb816931 +size 1035787 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4d9f009e78 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44dd629666149c31f863698748dd76e3137bbf0ef11fa5c2ca026d27e68125a9 +size 927393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b5328b72ba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec87dadce19cbbbf4b5213a422e92e33ac5d5a13b0e7299536df4f957f9a1163 +size 936877 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1c0ae54f7d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94d016e9f1e59e42417b4a45e3bd156fc15f06edea366e5f870218691cfe9737 +size 830803 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..011ae4633f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84e515b3f856730bcbb78c7ac271113aa9d1f464346b39e60432a81cda2d0c78 +size 936333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4e54985e9b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c31cbead97542184c0aee709af55e369a61a4c5f7bc8fae5797460c87e1cde72 +size 851619 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2d7a283cc6..2c3c9866b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5244fab53bfe14c06974d2576dd1009cd023d3225adf7b79c8c67d485b5fb5ba -size 960161 +oid sha256:ae58ad590b86c308ca26b842bf9281fef61f57af3f3e6fe0f20aea1286adcf17 +size 992623 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 1ad373c142..1ed672a0eb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5c123171bc10e287a3bca73a8fe400bcca87579940dfb690c3335bb377c4786 -size 922913 +oid sha256:85559c88ac6f80afe7a0f9f4ce153841f72d7278311cc28ce65d60b93a4fffed +size 950145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 7993044067..39343e6244 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:762babfd8e9fc085734a8b3bd166904010a8909c025cfd61758402c5e114daf8 -size 857005 +oid sha256:47cbc5f6d899dbe030f6e7b2ea9fc89239cc3bb73ed69d5bf1b32c0f337978fa +size 812605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2303c7eb3b..d10dbb3ead 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3482740cc34e30badcbbfdbb04ed5c2017270cfce573088b700695217af1e049 -size 899627 +oid sha256:450e7412f6a9b040c6d4a533873d5a0c8f9e3acbda7c2110f7f1c2e7ceb4ff0f +size 932533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index f115ba4a2c..d8f8d3ea8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3121e6ac69400ed27d4444db070bee399fe56967aa492c25e5ad9f77f85458d3 -size 866029 +oid sha256:17d94a9e59d4c373dfcd01860d57553e46702a62cd5f93b87c4ef8564c39fd88 +size 894445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2499a5005a..f9b9b07392 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09e7793c36238e6e34fc96ed5b3c513f06e6fef2158177350bcb790cf213db40 -size 967407 +oid sha256:4a10165c7c6c6b5189769afa28f20d5ea8861f3417bf5441f1b37c3a6e50c41d +size 999079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index e64ae0d05c..17de69fe69 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:675ae43aa0ddfef14693fe5e2943a3e18608795cc938f6319a74da78139c64be -size 930949 +oid sha256:0592772baaa8ada8f89616a355bde9aa3d3887e870bd0c013f4b15e6eb8227cd +size 957441 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index f580664a48..358998e5c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:019674b1a5c39b369d48b85476a11fa3cff1b7930e5060096a6449c162c8e05f -size 816201 +oid sha256:dcb1fcc2baae9534b0640974e1d4f03bf2252113251e74c52ae5bcd49a682ea3 +size 772739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8caa11b367..070ac9bea2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3588fe23a465074b07582beced95b9e2ddb414b723bb26b811bb36b6d88edcf -size 910377 +oid sha256:b7c75b6a0eb18c701a455e5c0fdb5016eee4df3be43542cb1de2d8d3430fe8d4 +size 945157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 96bc5c4909..03a663db71 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:daee45cc9d6c2cd9e425b0226f3c7d491a3c9d7e4b7137f30a14770cbf3fd26d -size 878307 +oid sha256:9d0b993b08ea25717fe188a97cb4379ad6a06561e856667d2476c363865a6b74 +size 907019 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 16ac8f3ea9..1ca51304c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b16f5076774803c64f809905511eb9b78153771a6ecb2b868376d42bb4f39e08 -size 1073707 +oid sha256:e704fdfb814561864882b0e9608a07d54a489e129b91d2d0cc28bf37ef0c24d5 +size 995169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 037d4d5c06..f31c08d904 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f220f9789b755e03f62cf7e6ae8d7fc8e02189ed49b5a40a3014194d7cf66ce -size 854955 +oid sha256:f7761db941e7953b8f86e1d711246713cebf981d500eb4accb08dffb93d9c5ef +size 888009 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index acc11f9b53..bb4c107771 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c2776d9b26e93d33d82c7c9c2cd979f57ec24bfd197e0eaf477f1bfccae14a9 -size 1042675 +oid sha256:fd559b4c142f449dfbd6ee59e05436729b8c5734a043548a22b864e292733dfa +size 957179 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 2d71db15bb..3a91935e80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:861ae17159b8e72f64410d0d68769daad82a0425b0f75972456df91f597eefe1 -size 816917 +oid sha256:72e7d106382265fa8a98634ead650bb66edc36be8eb7e19512c27825f45aa1eb +size 845531 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index c0d8bf67fc..ebd6e70107 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8df973a68e70a0b5faa01128712a492ad5993561376939ec620f9bd315e83d5 -size 886931 +oid sha256:4691ae40d17af7c5fb996df5dcca1c716ac3d73ab40e980a110f2ae15b3475ba +size 764387 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 4c8e7ce88b..f1d7e91dd3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb12b17ab77f121b153998bf481e24b5c3890e331de9b1ab64b1c89e8cc61c4f -size 710655 +oid sha256:f1ca6208dd4334336d554e7ad43d3aec3a95ef5848490b09b9884e2e2734b880 +size 665317 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0733976a53..8ea4143d19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca1751e094306c6a9f8ceec10e51c2f7a9c98cb4718eb34cd03784b0706923a1 -size 1024717 +oid sha256:481975ec86dc07926c25b4e9d17d2ed43372d3e2af44a0d3b1fecb9f79787e33 +size 943711 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a0638050cf..e6c2eee134 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6040a5aede300bc198800b8da037412ad22238b08d2736d2af9844c1e386765d -size 807247 +oid sha256:0998a31f3a551e47d82f0934dd8542127fea829eafd23c045808d6c61d79ba91 +size 839807 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index bee14b2ca6..cd9c80a9ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42923e99badb7f814ab1bfd603193967d7ba183bda2dbb476a43ecd56218de46 -size 996447 +oid sha256:5b78cb7d8a87d1dae81e6ae8adb4259daedda63d1d8aafea1a4fb4539c7464ba +size 909127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 3782f52529..9ccae84e16 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7dd569c13d5a9306b360cad9ecc4a9bcd01bb6b6cb0462188f2a35486541ff2 -size 773501 +oid sha256:80b6341d61aefa8513f7ccc065444dc4aa437dfab2bf70e6663515cbf0555238 +size 801621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c585a8135f..b437cdd890 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec6dd90131ffc3cab64283dde981a5e98c77a9022135784604041d64fd73fc92 -size 940329 +oid sha256:a0dd91a29acda0f26ac0e95dd1770a69bdd15748d24e26e63ddf58fd20ed833c +size 960063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 286cf3e5ad..dd928ac404 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18e15ba4f33b52d3225c45e7fe8c85e6e0d9be1ff6284e87429eec2be9ccf0fd -size 911863 +oid sha256:b44363e35dc83dab732e0264548adbb45dd80e3722560ed1d17517ab96913d1e +size 931941 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index e4005f0309..da68a3caeb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8b2de2839c6deddea540c7e4ee9f6a48f4f6a4beff54c99f8d60ee4374a82c0 -size 837125 +oid sha256:1dd92362ff8cf10620c46fc122f6f5f14f71585f48f5aed13a0343af9d09742a +size 780095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b4af06142c..566c137b1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d82da8cb45c368d32c9e6588e9ab39b20336b5fa72244d3c338845187e89d02 -size 879747 +oid sha256:5ca7554ba5c8db26d9aa52a52f4d58c4e91bf74cb49b3f9fb9e53b290d0caa1a +size 900023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 7993687c40..cb7a612d57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8533a8f3fd45b17a5c2c2305972ee0e47eb310adb1c726b962f6f66341568a7f -size 854979 +oid sha256:dc7475594abdfe53293ab1b94682ce1418dec0605604ac7df012941f5bfc0009 +size 877031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 30fb00a04a..dadee9e5d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f904a60ad96dbf3ce98de05b5977836e9a8762287cbf96c522077d0ff512e5f -size 947575 +oid sha256:b329ffeca0e71da5bbcc31b712fdd3e67603960676ce51a78cdf18663aed6ecf +size 966569 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index d24221bab6..b4abeb2485 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:575d9fbc78e1ec77c1cfd409f186570de3768deb01385c3be37980e31c5f9bdf -size 919897 +oid sha256:cf9fdf2ce0bbec2c05c9d766fa9fcfd2949e69c786850def3a5c964fae8a858f +size 939237 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 5f7cb54d70..67867ac816 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c19a41ac3673320f05859273992790f2f89d3bbc9db0d30f449bbf420b34d671 -size 796319 +oid sha256:4d3e834e3e4803e13687d39bbab698ce006cc5fa0f9e3698879b6cc6e8ea0102 +size 740227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9cff2a7084..1dc5c4305e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d13eb6a0ffa32a769524215020b8b7e1313df41943a7b230d54cd420408668a -size 891333 +oid sha256:96368873692e61b7ff3efec5d9c106f7c1a100e2c02e0de8953efa999ab3af42 +size 912597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index f3e4ee8510..a40c169000 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d80c4e54ac0a2046d1d4f92de3220b44dc86421f1cf92dcbcda8342f50d3da23 -size 866467 +oid sha256:0427877d6c4c841492ce5b2c85ca0f3ff203be434c58744f5bf2dc4194b83305 +size 889605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1ef00ca74a..3efeaf7dde 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5516911aeb7129ca59c489b8f7de6ae0e911b5c519a700dda05888e51e64c5d2 -size 1056243 +oid sha256:629646c2cbcbd725ed923870430271cd769dab0b1c8878a11c9143f19f9ae7bf +size 965173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6bb8b0d4b4..aa69a373a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9aaae4363805169c0fa8e62a43c6cc41b694f2786eb6ccc9cccfec645ab0ec43 -size 835073 +oid sha256:6e596e49c2959eb94d608a6b9daeb7b8358717e3f54b2b58be47d7924def121c +size 854709 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index ae3188e382..8f8cee1a59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae891a848cdadab1f406e534fc19b447406baa66cf6d0018fa828922033cd172 -size 1031575 +oid sha256:0243bd12d0623c01179265cf10aaa790face7224368d7dcd95c663769be15a0f +size 941343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 87b91c53b8..4c3b5532ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2b1415fb602ea0175229d2f71b4ca21dde7798d7065d27f72c3000baead9645 -size 806655 +oid sha256:a4787c8f354334588d7d60655fd0f0b92e43c06347be8aa3adb2706211ae1f60 +size 828165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index bbb86f1546..70b43b480a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e285fb8f9926e2e5844702589d75d5b30443f2d9076f4faaaf4914e6f2a334e5 -size 869467 +oid sha256:cddaf2e83c5b72f6f2534793e89c05408a3ba891ac235c1aee2843c150cbade8 +size 733505 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 4e5a22dc68..f77732ffa4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e356ed22f32c9cc0f91c207db7351b3d8652086f183e749add5b37d746846a7 -size 690823 +oid sha256:81878d57c69e6639743454f2ec4b41a32392e092d99532c62c93f3e36c7fca73 +size 632807 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c85b6704f2..055acda2fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b58207de55e9c7c36ec790308696fd2897052dd32c1ce5acd5a6467bfbbc93a5 -size 1007253 +oid sha256:76864937974b64d18667b6c694659001f8c534e73ab1245bc60f71916b00ef18 +size 912927 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f79535b937..d40dcb14f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96696137282891a1a2775df7b3a5ae233ad8b404a3807ecf3f645e764cc8e9d2 -size 787415 +oid sha256:1094ac44ba89124bd23ba7f9c8464cc589aaac5f7212be87e343a1d942cc9370 +size 806507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 3a8434a198..4a68940e10 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d9de94aa9fe3ddae99ea1da1d0bdf4186ce6423420219cff1ab5d37a7aaebb4 -size 985347 +oid sha256:5976103d425ee467c48048c6753cfdf6223cde727de5f5b6fb725612eb5f32a5 +size 893291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 689b299887..b159bba1c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6416490b9c61832f93b6899378c8be007772b94f3f317311bec33a72f21a93a4 -size 763241 +oid sha256:2994a4c0d616a9f7618c634e6b13fc10125f0aa59062e67841e762b39f95602c +size 784257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6c1da5eb31..f89b7053bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f3a18e880c6b84fff41cf042f78d77ba98b54c6012143cf78a3d68325b8f5c8 -size 1022173 +oid sha256:5e31cd20ae0e1cb6f2efd76221448d25de5925e949df1ffda95abbe5170df82b +size 1061345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index ba1fc046c0..1d7f297bb0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e114d7d62c6eccd3f244bf250b2656bbe8793f4d7cff77f5a6b5d739093711c3 -size 975403 +oid sha256:6cb594be6f8f704819a4489c1abe6085e3a5a70ebd5c07890fd0a0d94de3bf87 +size 1006681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 294311a71f..22b50ecae7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef1b85d68da935ad4949a5a3f0a3d0cbf194aa41941dd8516b30cb36bb08fc8a -size 1046601 +oid sha256:1eefd1fd9daf73cc3dfe803d63b87ea1033349537f3855286b3d1f4d65a0e589 +size 1063523 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 68519d994a..91045128da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d68270d634686b6ea3afc1a4de2ea35086c54f666adc54e7021f3e30ae2644b -size 932437 +oid sha256:440e748d6730f01876ede50ebef4d73cfe0d74b72f1a38afb8e2d94785a2d4e2 +size 893167 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e23461b429..a8fbd8df6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:824aea8c15b3766a72992df4118c22eb5d59d6b4ca0ea066232b48e05246788a -size 963415 +oid sha256:7d7f190662ecc8f6b1e239cc639460195a630003f2e859e30f66b022083d7ac7 +size 1005447 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index e0e86f2256..170ec967f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9000410caa8c33455938666ac5363786e9f3de85b4717ea41716a7016b0d870 -size 921085 +oid sha256:fde4817d4a98d5ab4028d43525cf8cab73ad6c9dfe64bae84119c44db311ebca +size 955175 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0258cf4d7f..e9044a9d16 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87db081c6691b37216491957fbc711b7f4ac0690105b8746e5c970df45f629b1 -size 1024535 +oid sha256:d840806fef5ae4d4e81d8d477302b81c7fb02c162246c3f7ecedae44e0fbe6b2 +size 1063953 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index dfa019a493..06485ae01c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48aa78e79ea635a736ba6e06ec54b036f7690495f9b7cf4e79fd47ee6a19c136 -size 977765 +oid sha256:bd0349d70ca613b62de6ece03b3604df8840d408dfa49eb0d0347d8c9b3a7ac9 +size 1009289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index b3fbaee621..0a1734384c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95aa06537d7a230485b40736226fa487e67b25ad5eae2a86f2d14cd5016b304a -size 955231 +oid sha256:2e91e1a34d8d8b230889c45ac91eeabeb509043e984527bc81e8b61f2f84ddb4 +size 971511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index a3c664f17c..8df2ee1d08 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee425d9f2e594d72d649d967f9fd32c6aad87fc9f0f8cbd868a3fcd0f743d21b -size 870863 +oid sha256:c2252d40256762c7fc06610cee1fd3af1543b625a394ef01ad8a46a9eeff4747 +size 830951 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c66399e7aa..d0c69b537c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc85df82e07f572041f4ea0bd1f610b68247368f0ff4435d180c26c1be94af2d -size 973721 +oid sha256:c9b7275b0ada206a0133e5b9b8c9ef17400b2e3ac486eb6822b1d10e92b9c3f3 +size 1015259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index f35f31a9f3..68f031b6bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0860cc3428543933ed8521b9fd2c0940a2af747b0d4e0b26802f4761f53f8f9 -size 930503 +oid sha256:09687b840ad540d4281747cc8bc7042110e9b88cca1d5184bf0b269cf5ed2235 +size 964147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e68bb46d94..2823a8e7c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cdf1aa02454f2785a0cf92b1b03588665ef57d41cd834df4e7f418c656a155df -size 1148299 +oid sha256:b99b6abfbc663a37632db8de71be3bf058c7e3263180955b0aca3a4f162f50f8 +size 1060633 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9af9153535..e89683cce0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0d44f806b19126f810326491325a1b7b06d5ff7c9680d833ec86fb6720f1f9b -size 911245 +oid sha256:4e6e644e689737902b380759c1c676a9d89bba6b8d3b6aa0e5c5eaac8d2b85e4 +size 951993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 3b60f958e0..1c6d39884f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5719714aed2e14457f98b1c2949e2f1483757bf2b4f74bffadc4e4387f44134 -size 1109077 +oid sha256:933989096d959823cb5686f895af81441eb8a0732fed07bec591e43676f7dad4 +size 1009867 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index d22389690b..c8e2b89f65 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85e8ac9195ce900f7d6b6c9a7a9f54a60f8bba7f2504129135a7b7cda0b403f1 -size 866103 +oid sha256:39fdacaae4f6b512160418788940b6821bc3e52b4972e5b62743190b9fae68a6 +size 897281 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 7fbd13565b..04dc6ce357 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1fdb64595c5b3f3fb3eade124d5892be26c719ea342eba59f60ca22ef14cea74 -size 952601 +oid sha256:26e878c7d2a4f6ac85be1027ccfa30fe84ebafc4906401d7fea8e88176145cc0 +size 979291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index fb4b10f1e8..e4c1539dd8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1deaeabbedc5405a0b5a543476529fdb43fa1810afbe8b927c7f702cd866f62 -size 956837 +oid sha256:2c0935f87e414cd5a7fa8c263627bce710c09c50a384bf16e2d2b2a75f8757d5 +size 834095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 0f5149be7e..e3a8bd8898 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb8e02de2269d0947c8ee2262995819ebbc679abb80129fa66b6b7f8ffa2cef8 -size 851953 +oid sha256:14d78eda116ef2c4dd3066e88ec32dc4213f72187538b4bd488a5afaf83be659 +size 866013 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 2598c3ed78..2ceeff6ac6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57570cb812054bcf7dcc24df8ee160d325d0fe15d12b8c07c59bc4b21f45b4e2 -size 765317 +oid sha256:acfaeca109d8216907c3bb9acdb1f1275f8cc708db0de5503d27364a82822569 +size 724419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8f772c964e..142f3f639a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f66642f1109d86ab1ec2f71e9580b02bfab3ddd993cb550e0c42ec0ce5a0541 -size 1093537 +oid sha256:80450b2398653d2bbc31487404091d2c7e261f02953f8c1bd9c9f30c9e51198e +size 1004589 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4be7e555fc..9434dd09cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c02472f05768e8f8f6e084214962a7db52773f13cea9ccaa5f7b5b7d4e600a66 -size 860379 +oid sha256:235be30a04616055146acc395883257116d6dfdefeb745614d02e09aba5ae8ed +size 900389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index aaa24c1834..836a964ff3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6232cb54b50bdf5d2973e11175aef1f9fe8da20da53c22a1559fb6e39fe9fdb -size 1057965 +oid sha256:c652d8eb2250f1f7e58178650e2381aecc8a5c109cdba17c8c14619aad3c81c9 +size 957177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 0f99542c80..42b8f603ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e551d0059d066c3a28998befbf6809a44b6921be84f1ad5afe4bec7dc2b019bc -size 817211 +oid sha256:80e7315006fc587f30e75d55cba9e3be9482f40bfad7cd3c2de77ed2100a6e55 +size 849327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 14aaf560b3..e521d32fca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e59cb605f368c91a0cbdb0f8a19a1eee384089f5c85afa9b88006715955ca248 -size 997557 +oid sha256:e50a73836576674630ccbd377d1c0392c98637630e17ae5fcdfeecdc46160c04 +size 1022963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 537163cf7e..85a7a346f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a3fc50f355083f29fdcfb8039790f05f5339518002ddc8a44429303b06986f2 -size 962331 +oid sha256:1710f965fbf5728ffce696cbdc4f406e7f85cdcdece37111e7a904aa8ea2d3eb +size 986355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index e0dfde97cb..62f7495f71 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f4e0b4e8255343f6c9c173f98ce99ad6755b79ffc705f396eefea9be0c6f391 -size 1028349 +oid sha256:4dade14232c46ecb36ec05e5c0f442045f4d7419a879d171050ed997b8541d33 +size 1036291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 6b8c56da1c..18003790eb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ca7c212debb6050591377a8350b7691360e6320128b415a18e3ebc3f6b6237e -size 908609 +oid sha256:1a955f751f37be3cb18014cafef6d702fce74050e7e1a161c277e2d4f4100377 +size 854835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 37f2c265dc..71a594f8a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce44fa10b34ba8b57844c2852096566c2ae974afd2af10dd94e28275ca146401 -size 939587 +oid sha256:58d75c3ef4247331343d7bfa104eb7c8082302edd51f5f7383d67e895f3e9112 +size 967115 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index e0c9507746..3c7050d95c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5e413556ebe635c68895f75366b3e689533a2808a583646b7b409681e4ea1e7 -size 908061 +oid sha256:2c3c1fd4014cda3f4ee977d342bcd37ca18aff2ff9b40370d68e78612091442b +size 934801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e75578601b..79f2467031 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3dc4e36cb05efd74ebc79ffca3214a2aaf04e5c23482dbad4bf3bc7985ee76ba -size 1000707 +oid sha256:d4adb3b639dd982871abfd03dd2f05f9481c4d4ec2a0466a6d22a2aff41a0c46 +size 1025621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 019904365a..5e76b08835 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2aa09e8f1c048984e0f0575281416d07153dcc2c88f8f867c9b26a2ee0bb9af9 -size 964693 +oid sha256:35e0a1ad8805bba1c2397ddcc46d33d3ca1d467f5dceda5ec4f2f891674d370e +size 988965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 93141e7f0b..9d742ae29b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:295bb74834507e28c98fd3260355d6e35f47ce014b3a85e17bfcb065fbcc20a9 -size 936187 +oid sha256:27c763999c1b54e9e164bc4ae4f4e1aa262429bc1fdd968d298bd938d92fb693 +size 944279 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 470fc6a444..5073149796 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f792222dac5c5ef9cb25cee4377339e6d2ad378e9b4bef3afcc8c21d548aca11 -size 847035 +oid sha256:83a1b7c7cb4d5fdbba5a26ab4b40522e92e13fa1ca1b19b19e6a1d8589b0562e +size 793359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cfc56c5d96..7f3c84facb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:24aee16491d1c3a14aa5fa6ad8f010ac7c6cc30234995f3fcfc40dfc8b44e34f -size 949893 +oid sha256:e9b994113b6caa00cc519c8a861d15cb6bc649c88557611669ab336f0b37947a +size 977667 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index bd136ed7bc..769cdcde84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd2ea05765a8e24587a582454950ff4ff5ecfe06311474861747e470f4c1820a -size 917479 +oid sha256:15f9a2476105ef742341b95c6b1f836c53a0129a8be8d54392f89981efc06049 +size 943823 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 61ee0a11c0..14154e43b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8921786f8b64046175fd2dc5102aa97d39ce92f034371b2282561af56e6746dd -size 1128467 +oid sha256:9c1cfb5b6271321122f39763851d6cf90531af75ca2448758fb3a60fb3266bc4 +size 1025015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c01e7d72a1..b7ddc082ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2858844986becc4dc48b9514a0fd04c38d2090ae3f495baff21c74d33613d35 -size 888255 +oid sha256:735057b7738fe7063334dfc0dcc542eb9c96c34df57535cf70531533004c738b +size 913613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 7192a05d00..65c3d467e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce9863ae27f1a47847677a0c5bc296a91be239138038ca3f60c4126bdfd66aec -size 1096695 +oid sha256:166a352367079d6e05674c889c4b571bfff5eced08665fee6e467fc685390292 +size 992059 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 62b8ef7dd4..bb2189b2b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f2981e919c23f02123fe7cd1e1bf7a5733d2b8c0a47b47489471e8181fddbc1 -size 853867 +oid sha256:adb5a682f9719ed5dbbfa16e5bbb49c5404d75c948863cd764a3e56cf7da9370 +size 878583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index cf571a87d2..f9b7559264 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1096afc5913f2dafd04c1c771b98c53405cbc96a628b80b82d5c148794d7af63 -size 932869 +oid sha256:27353a008fc908e3b4da95892bc9897fa5af5d6222472a018c27423e1c71f456 +size 949839 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 0488f46df6..a6e3c2fa93 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0ca93120f1a07c60c43b1fbcc3fd27e8ed5759fe86e3f1d7b92b9621cdebcca -size 937005 +oid sha256:23473acd64f65584aca272e4d8af861b89df0908f55db6b494f382ce036e5d7e +size 798377 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 3cb95285e0..8d952a8bb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28d8dade1891312da451cf31bd83bbdf922ee9a55bb318cdc04b0f595af5f51f -size 832911 +oid sha256:488c6613ad0dd95145b1eb31bb86c674f881c7fa37a4967fbd0ef3c366810407 +size 838733 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 3d219583bb..9a012c4e14 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f13a6a53f1acc84b1604ade50d0acd0603c134c8e751a2e0f776ae45e2b1992 -size 741537 +oid sha256:3fe74ece4e1c3e9e9d17237145cac5c64558722f4944e422a03653039266c7fc +size 686877 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e7ad443e64..3ecc5159dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:767f74f3d72ba6ea9df2ded8941cee706cbef2380788264101b250fc43cbac1d -size 1073655 +oid sha256:05bc688c081accdc0cd510225bd2bb1060cf24b49ea1451456749c7c6beb3cc5 +size 968921 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5b7002093d..eb4b4b39b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:194b47c48197c448f37a057758d8541dccac62b192fe3eaaa6a04c43b21fdb2e -size 836551 +oid sha256:a254556880db6ba258875a3b9111f951790e6ca395dfba0baa88d678dc7cc3cb +size 862057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 6be8c9b6b2..ac055c24f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94fefb24403bfa00e93447d34f57d9c6f2edfc1f65e9bb1556ad328ab062ac29 -size 1044893 +oid sha256:734ceb2a888df6d135a21caa7f9748b7b3adbb5d81408d7815c3a1165ae8c3db +size 939417 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 823bd24c58..ce9ed80617 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8780dbc05f642ba5b0bd18d8c92cd5f4a7653710926135ec98c5fd11dec23083 -size 804977 +oid sha256:0565763af403deb4ec4497936f524b970ec79a2db1a06c7565dbfe8b16d7eed8 +size 829791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 34dfc3e234..04a3ec57c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7595f1f986e0112b69e4290ae8a256203cc7702a1b73d7d60a9ede7ede85aed4 -size 1163563 +oid sha256:5e12e781eb9dc07da051eca7fbc38c69c7d978ad5d53b301d500ca32e0fa4b10 +size 1198837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 02cb7cc872..60cc24232b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00d2be7d605d51bfe03ecb9d54335cf7727d99d7f3d86f036996ed0b47b88044 -size 1081667 +oid sha256:eacff28ebd2bca80507848ff1d6b014822dd6906148cb0978ecae1a46df1b865 +size 1119703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 361b253fc0..aebc4c3812 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32213a92568373c383acfe19215b38cd78d614f71dfe745dbd4e7bafc44a61c4 -size 1081127 +oid sha256:376dd95043d31608b5d229447fd20a0284add73cf7dce87835e4532199ede308 +size 1054389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6dffb8becd..62df134d82 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f402756395c9e4c196ff7d21b842b504d81fc7391f8af113ede1d65ea2dc797 -size 1092619 +oid sha256:a4f8ddfa1c027e8e91dc9bc89632aea0dae8dcd18ca9af589e9ab2f9689c6569 +size 1130755 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 1f7d270231..cd738d455c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:24ac0c5008d4fe1f492b7371efb79d87280524cb168bb8027142942441c29910 -size 1013635 +oid sha256:c97b12c6e62b44834a6a59043a4dbe9f8d47d4fae9f112a6403e17d4ce9f5f1d +size 1055223 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0efe39e463..5ec7863001 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3153ee9ccead06b2fa04f55856a34976c6ed3c99668c4cda918d7b21c59883ba -size 1151223 +oid sha256:9bcceed39b9057ff575b92a99511048018db4036cd4b62e932b4dfd7ec108f5d +size 1186349 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 8b37aed99f..6f4e01251f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b57b522fad128c3b1d3f8c2490f78d609e3e48603457505b568bbd6933c2ce53 -size 1069329 +oid sha256:f4b74c6a058b7b7357f3bc11e2d63d85908632d84c64eb10faef91c9525ac84d +size 1105637 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 81cd2c5b1e..aa7e6bec73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e75d2ecd80525068b3220819b61b5e1e92b91fd7604ac2a04315ebfd25102db3 -size 968691 +oid sha256:f208c4ef2592a32e4e1a17ffb8e5a3b94090969830a6ca8c691e17bd6193c182 +size 941113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d0eb214994..68f76f9f8b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1cc7e78bb3e8f7d8bd527eabf6785dd6c2268c563a3251f1578201843ae9fb10 -size 1090393 +oid sha256:0ed995e7ff19f90610b4ba07e95a0eb754a41c404155006fd4d24d68c980c3c2 +size 1128577 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index f8781331d9..ec2cac1e12 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:597d212ec982a16534732ee200bd637a93b09ed0cbdec933273e3f199c67d867 -size 1012199 +oid sha256:d7af20142fb910c1b567836a447e5aee4f60d359ddd04cc5f87fcafb8cf6492b +size 1052257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6d85ba337c..a3627804ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:baf9477eba546cf581dd81c7b0b0a89e119073812f250e6d87edd7d2bf9cf8b8 -size 1299949 +oid sha256:6aac6ea83feb558471132e7a6593c7c3a82de68422af9c2c5bc9751de1dae3bd +size 1189789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b6ed9391fa..50748873cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9eb57233ea7f9b759f00b8df803409f68b585df804e1dee5e059b94409e0d6b1 -size 1047109 +oid sha256:704821b6a0b5ba3db52a72b87d318cb31154adee5fd4288a2eb21dc78a3217fd +size 1081789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 19c6a55164..99d4750cac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c74febd2f07467f13226ddd9113b1d1a337a9fbc56e148a8e1d92e803e78f9c1 -size 1243017 +oid sha256:35c5e9c8e5933e179647aba0f93b7aa01d821f09ceb5d480694ab886ddf610a9 +size 1114947 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 2354f84b42..910e564a1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f959de05a490bade72dc003b2e6f42c6374e6cf501b483c06d811f219258c782 -size 965213 +oid sha256:03de316a4511403265ecbeb093135ce1ba2f2c4c1f81d9cd6c26e2b36e567a37 +size 1003495 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index e0410db547..ea3dc13b83 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff7235df4cc3228a29e15683fead4c06d6edcd8fd4ba89cf0aa99f802614ae4d -size 1082735 +oid sha256:803aed879235381aea895f37264d025ad37edacc26ae49883a9b393424cfa26b +size 952741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 59d5dbef01..da09b187e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:689da56c8018763212beca88daee469a2f0f16d7746860a6d5837d78171f339d -size 872961 +oid sha256:2099adb277c0df55acc4d03a320b874b79d230d1c435bf279e064331d3630d64 +size 844645 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2b8b545b5e..2f31e3d467 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:861ff4284f6f859839cfefb2cb0c6a7ca6d0de501e85fd0daeb3c1ef7650d43f -size 1226835 +oid sha256:d1cce4fba022545c5d737b6018326336f4d72ad0fbffc03a8752048464e75d12 +size 1123927 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 492e56179a..507c65e989 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5e26ff7dfe0ea1c70f50381fa52d8f04aa8da07ae07d72bf6f66a7ff477a636 -size 986229 +oid sha256:c112bd7990f121616a517005a6f813e3c134df7fe505a7acb71c904da152d5f2 +size 1021551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 614cf022cf..4288ef1d8a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4eb83faa86c7c2129b61d063e85e482f66686fb314f06e8d49112bfcd24621cc -size 1172665 +oid sha256:0085a5eaf4c3dbe101ac11a69845c393d5312ef84ac11d1a49a97d6abc7ab952 +size 1052489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 8154be4b96..5f6b296c86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83d5daa7852afb053ad086df1e4c96683fc6965c847b4f6666ddc4d42a7d2f79 -size 906257 +oid sha256:689d459351fc6845ca264a0845073d30912f20100c3ec5451427db3ce19c8fdd +size 946809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 38345a221e..f10e6bc103 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17f9e5e4052051c68d82147dabd3fa6230868df24ba45e0db09259bb152311fb -size 1134161 +oid sha256:8ba5d7acd640331f9a61bc47ce18b064f850ed2814f0a8c6f04eb52346aaf27b +size 1149553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 09a25a7b72..ebf468c81f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6c184ff85826a5ba3452f661b4b71339ea7eaab393d89ddc195f3682efd87b8 -size 1066227 +oid sha256:c087851a6c3e171a2ab39e5761d6c3cc8fd05d70c05617367cf4d3634d6ecc4f +size 1093507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 76b41179ca..d889140047 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e526f1f5eea8dc3196a7e8b433b91d709d80488353458ca1487bae683afad124 -size 1051725 +oid sha256:8533440fab23d6a0b3c9cf41df5a38ed1a41d65782b1e866e8b85a4bd365c5dd +size 1005105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5c2a5d5344..8d3e0c7d33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f63900ed76bb18fd3515e46495873d09a62b79fd2dfb94fec8adc44f7ee5b688 -size 1063267 +oid sha256:73d3dba5f1494d370598533bd17abd9d712afac9df6b32c1543185cebf13213d +size 1081471 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index a77f154c26..c13630f761 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1470d2bd58c49e9266c3a59609269d8ce82d1abe52e33487fb3a576cc0e4e19a -size 998193 +oid sha256:2cc7b3cf2756fab661fe23bf1b9ef197be66f19a26edae3f6ffabde67fd2f418 +size 1028237 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 204950f5c2..f24aca9339 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73f9ef99791e7153927e3d8cb315b366d7ff8c4c0f33a0a0ed047c65ba964900 -size 1121871 +oid sha256:c1b3ab477c7a8f42f7f6c44eb216741dc4e7d4d71186b80b5d35a278778018c5 +size 1136227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 4b67d64f74..9e3f1dd91d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92dbaac0f8b69fd2ffdf24b40e35834330bf2b735bfefbd0d9235596b9d5095c -size 1053937 +oid sha256:9336cf1d2f4fc495af971a65188a6047f4cdc7056856026512d7f612a7cb313f +size 1081021 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 41ee2574ed..d07fa10feb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b93e5a012c8a4901412cb42702635dd2c2f9e052f5de4c39d4fecf77743cc05f -size 939337 +oid sha256:f1c659b2b4eedcc6c046d47e4284ddeca25ffb75adcf8d3fecfcb05fa338eb6c +size 891829 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4b6fae8753..62ddc1bf6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8555303e64117bc3252413950bcdf793b0aa2e3e1ed92624da96c7021e169647 -size 1060251 +oid sha256:8ef18cbd62f07a98c708bcfec1565f6f1150135d4b0ebf9490a8e07ff788f1ab +size 1078455 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index a6759b5f0d..9b378b5eb9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba4d14c9ea973a34c53c1bcfb001a4991e9e3b4053c0fff12228701698c663b1 -size 996757 +oid sha256:318a2b2437de706fe645771ad87a100de28bbf663a8ef225f1fcaa8aed98a500 +size 1026061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f4d552a1b3..48144fdeed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fc2d256a6c3833b84dfcc569f0789e554be40b8fceb1ca0dbdf9501533261af -size 1274543 +oid sha256:2c5f8dd3c332c1a228dfe69592715b96e3f934e1dae0a6190f4339751445f671 +size 1143613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 66c784d3e8..bcd0d05cf5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49666fa188d7d2ef27e307b778e293b43c5f529bb5ce96ef12042323c900a15d -size 1016917 +oid sha256:660f9416419db43e022b7ae08a019fa42ef90aaf8a125282628d435ddada87c0 +size 1032505 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index ba2b931f4f..e60eed0944 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa5db65d448a861876d324a46c064db99d1877828da533010c65a04677afdd7f -size 1227427 +oid sha256:ed7c8f3669732c806e780616bcd62a94cbd9d74f74b7175ed13cd6de176e18b9 +size 1092501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 3dec43d81d..b602b4e2f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0bdb826b70723fad4b966687dea9c51bcce734d2dcbf69b0c4cfe0f306c14c66 -size 950611 +oid sha256:eee7e1eb2800c9f296030dd6c708951d49105a5db91edfb6c935717414dd888c +size 978139 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 450fbc94e8..3a1289b399 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3aa37ebd64a995c5489cd88f1da48c63fcabd775dd48e5104522fb143ca4f947 -size 1057329 +oid sha256:da06d796b58b48289f4258d5b74dc0d7f3d102666cd12bca04cca65b73782c17 +size 905777 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index ece426c777..8341f33899 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f6c26ea0c2ef4b8f659136390c4c3633d8d89b3a0c3c4217e36190098271bb1 -size 843559 +oid sha256:4220867befda136b3d7a50084aec54cae59fed4ea4ff17bbafe36204e45bb04a +size 795361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a1a6ba66a4..9b722382f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3ffd14210aff23e76bd3831bf43436d4e84f0ca6ebc3ea989ed17e46383cec1 -size 1200639 +oid sha256:2add840ce52761ad8347c7964f752af1598dc992866da7d9f322ab44eaa857f7 +size 1077751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ba993225fb..d9bd445f9e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6ced76790d359a9d3ce8a5f1a77b94a6e4623d1174e9b624f839f5e9f8d7931 -size 956087 +oid sha256:044a0952ad329999ce8bc173fab37e4d743b28397b937dfd159a52d4156ef051 +size 972267 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 57cc996421..38787935f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51026e9b4bc0b8859ea8a333ddfa9dae01c596dee8c7630d782fdfadb4ee7962 -size 1156287 +oid sha256:3b346d087713be94b8a5fbc18c3ca5e9c15fb9d6d9046d3ba82bb3f47804fdba +size 1030043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 71e37ba4d2..75b3c88f46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:041d22dbd4f54ef6e7a0396db4817938cb0d2f40a7fa1955f02468018af40588 -size 891655 +oid sha256:3ddfe3c8479a124eedc20c52f6598b849c48aef29c20d1b0a3d0e385f1a13fa6 +size 920663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 6fe74db818..c59c682a4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f07ee6475e836503ea0d3d893e6743f407e05ae2aa8b084a3806439fb973dd7 -size 1360625 +oid sha256:19fe3b4cee33d2d7e0a8ea30ffe73c9c2100fe685abc67032eb13593213c6458 +size 1003995 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 0f94b00691..3dcc1ddc14 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fff45977edecf32c3807f5885eb830bc187c597d81a001c9234b972609f476b6 -size 1243303 +oid sha256:70a769d83dcba0be3766d2ab1b3b30e806029b3cfcc24282bd7dace1317e4f46 +size 887165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 307aa3ea6f..ac730508c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aff16a646cd36d92eaa6846fc09e3315e4f61e11c368b3cdc4da90a10ed45348 -size 1355837 +oid sha256:e28760e70a77dc82505c994ad5bee68e435ee3258ee19736d06df64d434606d5 +size 1003449 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 2e26dc9a2a..874068cb8b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c92476ad3a5b1f2cdd62672369f5df34ba656656644c3cd82f3207f41bff3f2 -size 1232349 +oid sha256:4b622639d88bd525f97307ee81677d6d2e77b3b4df311f8900b1959df0fb47bb +size 895057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 6fb46b32d6..ed33d1495f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:abf4ae096c7b312f7756380ed0142fed7f9501482e73647774b82d7d8268fdc8 -size 1654387 +oid sha256:67e6efa664f106afd0eb265b61a6385a50c2b52a77b6f1541b90042af74463fe +size 1129481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 1d8e12fe8a..7317c24c9d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d70d58f985c151bb4a5a482aa31b15ee72371ab5d131608525600b35e07f783 -size 1537411 +oid sha256:e8d12aaf0dfafd6a9371d36dce96ad5ed9bca79c3b9b118d3b96a2265caf26fd +size 1021827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c068aef209..46d6c5cb40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1e5f718c5f84a066fec93a2b483f2677edfb39ea1367a8b6d946fb21fc38025 -size 1431913 +oid sha256:040f454feb7b354f73075a825b12517637fefa558873f352542bed5ac0fb2c89 +size 1008189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1a7bda2305..98295a55c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b7e35e38b5e998f30aebd6a1f08aef64c28023af80ef13a3ebe273831ff6b3b -size 1309757 +oid sha256:11d055aa0380ca809a3710bbe0113e4d6f7249ca110d29d3bbc59f2756de16b7 +size 964669 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 2ae283b3e6..fd38e6c76b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09682db7789f71e48c3786c680ffeba7cd7cb1ddc1eb32a459df4995261f2431 -size 1426241 +oid sha256:5669750712bd02cb4a25546477cb9cfba6cdbefd17cc8814b308291def702546 +size 998323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 1c9733a107..a5378f2acb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:62024fae6ccde67790fe2385b40caa6569f60316e63f3a941c66705c5c552b57 -size 1302603 +oid sha256:6abbefc771e9fc42b01b354db2f9c0eaf0fc8b52edab818ca12411bdad46aefa +size 954655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4b52c721eb..8573164401 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b18c0423b83ad82bd0263c8011ec6a13990f8d08093832af623abf7088b1a65 -size 805803 +oid sha256:44dba95ce591f879db028cb7d61bec93ef69f2db33688a2168514a9612cbe70e +size 799587 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 39e43743da..1c39d7fa62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c8fdad6b59f52f0d9dc0bff0680b71fa4329fd760559173d902dc55a7c07969 -size 724401 +oid sha256:b868d36522fac2ab38649864350c346e30ec0e71cf6e0099f3b08fb1c3d51896 +size 723365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f748d4e259..ccc5d32005 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a7a90ec4808b3945c65f3375d8a51a15065b523ae8df17b77b6ca6e4fadcb69 -size 804219 +oid sha256:5b7027c359537ee3021b29901041218d92af9b5467e97877fd4a365566e8b8bd +size 797263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 120c552b2e..33ca6a15e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e339148440f86f0aa613e4c54b7cca8d1143d562aba57293164359863be4fb3e -size 749161 +oid sha256:1d2cd96aae90681acd509590c57731839638a555b926305b5f6c9d37fef12673 +size 744325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index fa9afc75e1..4e56b37e97 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:835a6d8a9f77fcf56ba22fe349dd7f0f29e242e17b93a3166ed27068cc02250e -size 1497673 +oid sha256:ef4a9500aad861e0a60c43215ccf1e7ff00834bc1b147d1d7e1349daf2ab4b8c +size 1077451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index ac754574e4..6318df13fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ea715d9e4947a8ef057dbfd5d2d858dcd42b946041bcc04636eca81f7e95717 -size 1297371 +oid sha256:83f72f92583e68bd2bfaeb2de810b491323c45c95cbcc7f82256e92350bae825 +size 965161 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 36f4052721..1a75f97186 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e40c4485f663f9b7bcc1d19b95b78f2a6152d18fc86d5da7fdae4f0f032df8f6 -size 858471 +oid sha256:759b5b9dd384d75237750dd203c71da9dcc707e90666cefa66e93520d2926da4 +size 806277 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 23e0f72104..78f6518db4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0bf6f664b1c35263d5e9fcf5f920aa1bf364359e205aaaf7c4809fb842d12573 -size 681405 +oid sha256:31936dd6253b8913a1d7e10c8a29e51951e45137d7a4e5a04a23939b1dd6ee30 +size 679481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f278814465..773c604cd0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23afcc1d6b3d81f33fe6859a6259c5d886d19e035048cbcbd83493dc00e45105 -size 798183 +oid sha256:a550511867b9c1f5a6820bbf6834593534369fa57d145af9b05d4c43f21bdd4d +size 727835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 308ffb4535..575c5e9646 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c63ac7300c11e00a82c504761290da6baa3625dc7c790602edb6995bc60f65f6 -size 629999 +oid sha256:b813441c07db593ff0acf108af5348aa86dfc2f722c745f6fd4ab3c243d134df +size 627581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c59ba1bb47..762c9eead4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58a5a76cf0e139a40c4b95ef859c58d362c6c6adff3df729d3292c1f72a9f1cc -size 798649 +oid sha256:b3325543bc1d226c8faeede91c08c128bedc3c0dc451f5ccce2388839b784e7c +size 789523 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e2e6361515..7c9c476d53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f6e3564b6b6f0b4351db0cd1a11cca7423b1e6b21fa4a0726af140a483755a0 -size 717247 +oid sha256:df3ece3a1016fe213d56482d80f988bf12f1c3db747e9e78c8180a4a9996dd1a +size 713301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 15c6a64173..41edc28e63 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1be93c7215ad70199b98aa6ee825d5d0f590381944b678fb0272c76905fb1a4b -size 797065 +oid sha256:e58f402ef85af69acd9f4f67098d23446e9a1a625b7a861eab9d020282351bab +size 787199 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 23f225b6a1..a3a91bd869 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a395383b592f5a34282bcb7e462df89d6da1224398def9f97cffb0bfbd284f66 -size 742007 +oid sha256:9f3efd75cbb1ed9cc2335c5b680176c7edabc88d072fa69d7e4e83ac9b4563ac +size 734213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8edb6ba301..ea73fa866f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e512bb2bed85eca16a0875655bacab7a3f1e5c57f37a84258da818cf9b7222a -size 1491407 +oid sha256:c95d65783524462e7154b49e8f1b8807e100a2738959d71c66d6a5c603a6e6d2 +size 1067683 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 0d4117ba43..6ec83f4d63 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd6d7b5f7b0e65ddcb27df1568739995eedd3ecd63887b5f1eff9b286dcffce6 -size 1290219 +oid sha256:868592b19b594a40d6e808441993f3beee83c8513ea090d271b6508c7144bb34 +size 954999 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4c3b9d0c49..3e1a601fd0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e52787e24b6dc6d3709dbf8bd82e8332dcd9838550ee0d15c7fff53499d1cb0a -size 852797 +oid sha256:4865f9f29350e79a460abb3a7a71d4c8265a424442a6345544416b176fb8f79c +size 796557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bf9b564a10..b25fd01bac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11a1e311963089146d911f73453facb5c19aeba38d2122baff33bbf703830d23 -size 674253 +oid sha256:3bd8ed97d445cbc3516581a63b42d94515a03f6d07dc96c615e71f9cc76e5a99 +size 669417 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ccd3656db1..e65e1a995b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03c0f93253b7600e9a94c1518a4191947a70b630c4dc0f22f911161acb6c3e9c -size 792411 +oid sha256:25ece66369a8f0ecd1f5e96e35148ed7cdb7e7d57131cffc9ec71111c21e5b81 +size 717967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4ea13aee73..d0803c71bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e2c40333b1f1f6d70d83a7204066fd952bd479812830ae0a3a336b811c62a51 -size 622845 +oid sha256:316c2ff684aafadadf31ac24497cfe03be41ce04c832c28d28d6fff2c5619868 +size 617466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b41bf548ce..312b974d97 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a90da736706841aa99ea9c506b33a724767a8f4029cd8178bfb7da11ef967fe -size 837707 +oid sha256:cfa862da0808091492aad7d48cc8042c25b2a605ebcff3f6d795f9aafff74ef1 +size 808255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5bcecb984d..028fab856d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e70d2b8a535480cb34686dda71b60b99d45832ceeed25c9c24bbc0791b9b83f -size 751273 +oid sha256:16941f240e8a885a5cbd53b452af1fdcf74acae3066d663a2646c3de01fff5d3 +size 744119 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 121e3be093..ceb58c769f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:742a4e049ce08354567aba6000b62987ed659f383e339e57a56e96efbf931aa7 -size 836911 +oid sha256:3f9b823c1ecf07859489b1635306734709007a6e27b260a7d9dc52a9583a983a +size 806719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d2ad729b7d..2946db46df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5485c3890b13a0938d93fe5c259ed2121078c1578c69e5f6ac15ef1338c94f01 -size 778695 +oid sha256:3b3fa7da2bcd66f7c116c23dd59d83344e15cdc1d74ca3558a1e00f5f1eba4d6 +size 749787 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4ba04ed099..6c6018a33a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:280b92067d453f7421817e617491ef02eede324614ed12fb71ba80bf0ce7922e -size 1726465 +oid sha256:4532bf9c1e3c3e1ce84fbbd33c4f7697df0718019c07a3371b1ea9c5a41567eb +size 1165003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 197da0765e..44ae2a347e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eff8c99e2b892889063c48260af139669950575f1ad48ba99ca20e4682e34765 -size 1604357 +oid sha256:7030c744a694ae0f98d64e8bdf99dcc07d9afc02a608e9c8786f690b133e1f91 +size 1094201 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index d56d2fe63f..20ef0972e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06691acee3d9026b29a32e4b89dbc3d7801c305be31a448a59cfb91e654fd27b -size 883419 +oid sha256:930a914c489284a9e496f121002acd3b8cd1f90ef774f7bd2e1a82451e4896ef +size 785541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 45e3474b02..5f8d231754 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e932f6cd20f22919ccdc2ab7e7852e542fc6e17d8d2513aa414d3d1e8eeb1722 -size 699841 +oid sha256:212645b3bb50521422538c747db8b53ab7813c623b5b0478c98a86037717a991 +size 688593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 84a2a44c64..8e5acfddbb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a93c951c18a41b65c6493827fa1dbb2479d8af7215662f7289b4a3a8e0819f6 -size 821997 +oid sha256:b0d3366cd72ac1049a8f1662daf90e570b90cf3a9b6dad3656807a0758d49607 +size 722491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 63569856d5..036c8da2fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:619d0d3a07065c225d69422e2e03f444da9e13575a1f3af62a7aa1ac82f411ae -size 645325 +oid sha256:54265e236646d7e6a2e2bea24a5c438231c5337536b0025e6fd0c686d0e7ec15 +size 635213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2857601a10..8e80a5e1d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1062506ff4bb7396fe791c2ad65f3c17638618636ad8e5489837d7a4461a90cf -size 830553 +oid sha256:f7964dcba368bd999ac613514e156cd6257497e42f057e7bc25f9f2e6785690a +size 798191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fc1dd46598..40573dae25 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:636e8a6a6a56470b5e0ef51be61412b541df292583d7c773699c0fa0904e6708 -size 743329 +oid sha256:bfb7efc036318c6426f4d29a0b0689397780a8334c07320576669e7fa9ffc1e0 +size 734055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d29e8e0f02..8dcf7b31f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50f3f657ceb8f6ff5d6ab8856e74a547320c03125429277ec1831b9234c8068a -size 829807 +oid sha256:4124ffe48abb709f7358581d6be4abdf7ccb73c469f29092c0b44e7b00607b1d +size 796605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 82693e6f39..90f98b865a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a3d7bb4c47b171bb339e3700c990666495706726a8010d14380190ef594b349 -size 771543 +oid sha256:b4d65992396fee6311f22d13f3b48f42719894fdbb5c340e7e70322eb2252f15 +size 739723 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 6bc29ff4b5..5b99e49109 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3211205b8c07d993afd0b2ad2e7f517bad39cac5c18256ab404b77407c28c58f -size 1720793 +oid sha256:7a1d7a891113bcc225001fc4d8c56e76b57d2390b7b89aa5cef884b8e277a08c +size 1155285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 2ad2e45dfd..1f213a2025 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f225d1f82a36a1be778789b2e615972c67a94fbbd785948e92a77bf24a0d238 -size 1597205 +oid sha256:79c8154742c1519856a0c1ff4496e7ecf809fc6b71fd599d0675506a7ee26517 +size 1084187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 834666b06c..f6240f4ea7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7d3c6186c40c261eb8157ab10b76a43f491a1e6cc1791abf62fd47bf307da32 -size 877005 +oid sha256:a6a802b0a6ab56b22398dfcf78a23230c5a66e3dbf28a98e2be95ff567f0810a +size 775823 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b7dc4edef7..92ca78c494 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6b8def050ea38d75c13c2720d68677d6d21a0e403fcb5f7ec264d4fd9486eaa -size 692687 +oid sha256:054362a7c6a6e3234acb3a12bdcd35e02dd854b16c6733c35f302959177fb8b9 +size 678529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 026e4a47f4..b44cfa8f9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba7a4eacd9c968aab0971b4001f8d3e6d5146081d4ac90261d2293eb13e95447 -size 815435 +oid sha256:332ab99d3eb6488315889f6afcbbc56fd4ffb6cb378d9f7c379cf5ee6845bed4 +size 712773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7ecb01c339..955c4c96f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da89f90a9d4882cd4c313cbc3c0b9606cef8e5e6d6f39ebe0bab024224574b50 -size 638173 +oid sha256:6282c4cedb60f2e38ffe3766c4fbb4fa34cd8c4e6ee2ae8a0086156a79a4eab6 +size 625099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index e469c215b9..36b44a9a06 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22f26af172eeae680542ab5f83f334f24152c2ca4630fdf1b4c539bf4962ecb9 -size 969017 +oid sha256:663d189470dbc5d39a10e1602ee986c3fcc56eb5747fd129dbaf95758d9c8f70 +size 793095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 281628f2c3..2f8464599e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5992008ca87028fce950dc645fa05667b3fd0a07ae72cbd95afd1d07cd0380d -size 878187 +oid sha256:68c5aca6f97732b7976e626de7e7658b4a8bd02e65dadaf385e622701431a047 +size 704089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index c77f167f4f..91995a0dcd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c0cde0f6e7738e2ba48635d5c566a9879251ed43b841be1cdd0dbb4d9ebf6af -size 968521 +oid sha256:e21140f8d5a713b20eebe129ef6a6711f60595417c867ae51367cdc807ebc2c7 +size 792501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index d4fc98d887..4bc6396293 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6dad78ce88acabac153d4b0af3301540674d8802a84a8c81813df0e7544d528e -size 877543 +oid sha256:1ac454888b3b856785cadc50d3b19d92417244cea0c108889c1edb2f9760b120 +size 708181 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index e595f2b0bd..1571f26e95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:937f2d0e0e7bef88fb5abd23422497544b7ba900f4016b53cded002a64d53e0f -size 1102693 +oid sha256:646cc39e735bc9604e0a6170a42d9e3676693bafec2d6fa6bfee5e8b0d80bcd4 +size 861305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 8529b10fda..d51379404c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b929a8da58859603a69de2d725cfed50b725a63a26679b285298dfc632d8d3a -size 1026859 +oid sha256:d90f8a3ec55f3c29786c50ab64a65ea665e89976708806c3ecdecfcd4f77b2a6 +size 770721 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f576c24620..c587363642 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10a6ab1dcccdb21b061cd782938b75e07fd374820188b0d4e5f8211b47c521f1 -size 1050715 +oid sha256:7f48747862f12d3f5093193aba80f9872608ccca79788fee14c03b6e7f5bb200 +size 829209 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index e9e13c6b6f..92b03a85d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc6bcdfda59076a692d7522734a0c95ac6f99b9ca9959549fbdacdcadb5408d2 -size 951449 +oid sha256:19ececd377cb8bdd40193047f617474addcbbcbf13ce798c6d8d12899e8e2de1 +size 790869 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e68b1af2c4..365187602c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdb8ff44a136c378ac6079752ef3b196b56c77d929f43f393d0afda9a5685cea -size 1038777 +oid sha256:fc3e3547e30887ad3dc4f1b627f65b27d54cf0e8f71a28eeccfa9e1f2c660fa6 +size 810561 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 37e86c96b6..e96812c092 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e5dda08fc71786ef0a3b2597cb3d785a88d3e1098ad59717dbb502ea02ba36a -size 937141 +oid sha256:a6abbeef2533dd10d26c5b3f53bb15804a3d48040a8fcda2c6b813d32ad102d0 +size 769951 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6ef4c0cfd7..7d7247d057 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76de90b0d8ef5b967be93e831a5a3e8388ca6e0401f576e2f4d0f730b7cf66da -size 914781 +oid sha256:0abfb485b007c2df472fb656ec214e54a00b5e3c5f9fc99d6bb69bec6141208e +size 928791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 928aa9ec7c..b13ab0c4ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:098dfc664aa0423a0e252f88c5688533fa3e821fd70954ae78bffb36cfc2325e -size 839051 +oid sha256:2c8969f5d46488764b10b0c2275a20c717b96f6ca6fb787f1d66a31cfe047faf +size 853259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 80ad9e57f3..bdedec8023 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13e417ffbbcc32cae824831114a02a1cac10e6cfe28bed3e978483eefccb3fe0 -size 909199 +oid sha256:c9d4c3cf04bc0184e45ab7af734fe46d10ab4f3deb72e208be2c632cc4c56fd2 +size 923259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c959013a80..0b9aa1605b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:760612fb2195393fc95b6f330b88f18ca6be8fe4d8c66c67f5e9e52a5cafb1de -size 859025 +oid sha256:f1432fa62172ee1c6b70f94e79c447014f5152dab4e396660d689609f7d7fe90 +size 875157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f94a8ff776..4ae76e3194 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8efa69558f227e58695207ebe32eae5705255d516bfcc55170e0c7def727dd81 -size 1119977 +oid sha256:9dfcc3c947548f754295e686970a7a50d7f58e810005e9cea0403c02d6fcd9d2 +size 899951 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 4316834717..6bf0bbc484 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbb8194c99faed35f8bbe40514cd1194a4118822f9473cd48d931fb592bf076d -size 951447 +oid sha256:446e9e453f5c574631f62cbc8fffbe9b590e7bcd00ad9fd148dc58306d332a15 +size 795799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9a84462a43..ac66f601ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9462ebe3ca036101a1ca06d8162cf10612e3bbea2b4e57258437411dcb3ae6e2 -size 1016633 +oid sha256:140eace9e5287a46083bdcb957a8735cd95c9063e8532d4ed1eb45b6ad58cf2f +size 947913 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e743c2c148..d064f0c408 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e42cbfc152a979537e1fdc870a441e953bc24a756da9329b7e41254d9490da6 -size 814705 +oid sha256:0e23e23ad5b08b4057f31f3f7d13dc5c5b1fcc65e059638a23af0dc2619664be +size 828321 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2d9221395c..50390c28e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bf0d3f52efe432177f75003ec0e08fd1cb6d9ac4ddaca79fc65c4a4d393eee7 -size 941399 +oid sha256:dace8b294d03d4302f64fc6eb24ce43c2044311a7d692449eb7cc29b4366d1ed +size 849983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a905736047..dfd9f57f86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:234cbf6b10f31f054dc1bf3e6d01f6bf86d150680fa7da55a28c26faf4e12eb4 -size 739815 +oid sha256:a14427bb35d0f0e464105bde7678e7a87c0549bf4c9491bddb648379fb2dfd56 +size 751851 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4391baa81d..0695d3a910 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b889aa0c209070928fdcddb4e7470bd4dd51f91e9ec16e5a683507988487979 -size 902053 +oid sha256:71c2516ba296cc4809d3dcfc9e018f7484630ed600783c0e122c930cacc42787 +size 907085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 091e3561bc..2a1c12f8d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7daf0110428f55ad3c1c52668275d9f6f772dcec048807f194506259aead0362 -size 826323 +oid sha256:72ed424f46e13dbc8bccad7654d0480e9530ca4d136463b156c884a3a42488e1 +size 832293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b5140f2772..54a9438f42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bdeec5701dd23687983269747e506dd982086157a2c2790ae0579d53e5dfda20 -size 896521 +oid sha256:0663e6afc421653a43ff8b49518792b208fff3d981b6c133581b78a298f018d0 +size 901503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d5fbeb67d5..a939271855 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95e293ae7648f1bf79c75fa5f928bed2167e30cdb94e396a962427b524714e2d -size 846297 +oid sha256:4b6f830ead267f4761c8da15c81d5e00868b4e77d66dda5a0163fde8bbe23982 +size 853401 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 384d378894..3bc7517f98 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f20f0a2c5d8b661db0540110e6246c4390dc5d43882be123020ba8dabf596f0 -size 1108039 +oid sha256:2e9ef82bea8677dee35c07a43cb56f89c9f6954369c96c7f2661d67f3c79c3fe +size 880463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index df9523a9b5..737d59659b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8b6ef7d30876ac245716ea49f6eea3a6ef8b6553e2ae5884acaeb23cb6ce174 -size 937189 +oid sha256:466eb25c829d8ab3c589bf78e658137d02d8fa2475b093c424f4786cfad3634c +size 775671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 34e6f9507d..6d9478dd11 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:551deade8b74d1a5d8363231406856836362948f53d02ce186c50ddd5a669b26 -size 1004547 +oid sha256:5193f96e93dbf5630e0ab4a3054fb6a4a90ce16525c978f0fc6faa74f916a494 +size 928425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 75e999d6dd..37dffd3764 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82b23b8ce9624cd179012edd676d2aec3df155cee71bb020f419c9b656924bbc -size 787769 +oid sha256:514e2ad7c408b6bc4329e53918b67075f3e114fddea53eed7ceb8e8c6338fce7 +size 806565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6161452cca..ee2701981d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23efefdab554df70d024de04bc61b5a1d04294a7ed35dcfd3ee837c7a22b6378 -size 929607 +oid sha256:93190fd411d83ae84aeede7d506a01605c7c9cebf11d11abea983746e8a9b286 +size 831187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 19d8c24d0b..bbd6adc1e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f99b03f285ee0ef3e8aa0244825a3629d98df37a963f5c63f14742856c4d53b6 -size 727087 +oid sha256:6250e04bd9c47fc0ad5c6be4a6bb78007f8bb0702d40bf737314c002d5fd0bca +size 730885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index cf73f5ad2d..9595d32af1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cefd9e8c6850551b9a7dd1750353efbf897bdf03bb5f6bc149b448fa1b6e3c09 -size 949051 +oid sha256:ffa84a635884858d47c48f5d12aa56a9ea6fd245abe863106164bed122330677 +size 938247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1ff3bab2ab..e76025871f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b0887dd239b02f949752ef1802f2cba379ec0b6514ae3411fe84f047ece53a1 -size 866959 +oid sha256:70e5b1eec62c3f33e8ebdf92b0aba47ea36ba55dc4679ed7289c206a25eb080d +size 880773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 65d0b3a870..c47fbea11a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1773b937d5ae3f610df91ecbb6726f0bfb5afd1486a9093c1191b98366400c92 -size 942731 +oid sha256:04e39e15a637ca7c2dae3539f5ab876a3044009e55676d8ca9cbfa008fdcd717 +size 932717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1d33f7c2ff..cf9b765294 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1edeb94193197608c6cab91eef14282216227b7c28a1f84a7df5fc7cd8745ea -size 890139 +oid sha256:3965bf65c8f94ce77210ff35f0aec46935ea361311ad1903394deda2faa48090 +size 882197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7e146a707a..213901b6ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1053b61a2df05f8f8dd93bac7f99f069532784aebefdd42b50f84889418c7f3c -size 1183651 +oid sha256:5d6a41b56c530e648a4c707f9023c60fcb72e89367561b79d3d56ad9b0d6d260 +size 914241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 075582e6ea..32b595aa53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:578a310bb6e1bdf862c2941ab942bdf9554178c110cae6c9efc9bbb0795d3f17 -size 1100911 +oid sha256:3ad52897a396c587ea4a41566c8df7844069d0e54d9e04c75c5bc8920bcfb3b3 +size 858981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8dd839d8a4..36dd702ed4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f645aafd1246faf0c55f59971a4b423b02af677d63711de8fe26b3d69770dd56 -size 1039163 +oid sha256:ddcb047a06e75da2106c36dc476daca4954e84beff7de130ed1393db7d69a500 +size 927127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a4d6c55a65..9af2e61be0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:feb89a5ac8fd9c9ef75db88b1af0cea8a149208a2e30995dcfd94b731776e0a5 -size 813455 +oid sha256:8bf5f45869e537c8e9c8e440df09eb2810afdc1d4c72f534e6ef16ccabf41f0e +size 824655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8380ea7e05..14c329af33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4d530443efbe28cf6465e82df45cbd684fcc6d63d31f4a5e6776a268ca50988 -size 965409 +oid sha256:f47dddd56712ebe35d0238021a77006f9ffffe0493ba684a86a523a1c63f9d11 +size 853915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d87035df61..4e02b1c6ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:337ca018bc621c88fb3f8ec5ab3e9eb8036c241f502b63c00771c94f99a1a184 -size 748137 +oid sha256:6145ac73bfba22d3469f66c25d9cc30ca7884fb10c779fb34a2fe74d912c617b +size 761505 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c2b495deaa..7fc57b5720 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93e069f9aa79c522314336480af28977212523369e9f4b6a4c5132be875cbf30 -size 936373 +oid sha256:fd01ba7faf1ebe668a1781d141de49c23a57594c6ab18b80904f6afa72d99190 +size 916491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 171c55da70..9ccc3a9cd6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1976fe6403f458b7e42345454d0301cc8bb344178e3d23ea70f915f553221e27 -size 854231 +oid sha256:4649fa53fcfd1eafb1b7da119cb47e36b0e77a018e7ddaa65be22858778c0052 +size 859017 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 702c384080..4953133da1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:546a60d7705bd83f13042d7f09d4891e8fdaff9f1970b44674279304b96277a0 -size 929213 +oid sha256:388399894df092d773f0249455247eb3c28524821f9468cd56a940c43338261e +size 910961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8bb25d0e4f..b5b2027696 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:167c893c5c1214113343cf7ec20afdb1fcd0e5c821a730a0264484eb03ae96d1 -size 877411 +oid sha256:78a2426fbcb38f33c1c5fd2212442405af12decd0590b359e3f87c41f2edf15c +size 860441 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d0561b970a..98f396017b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ddde9e20c92e6a1941751e918d0a55a3c21e5d733e36bc10a392ccd8e1dd5c0b -size 1171761 +oid sha256:99159d9c0d2abb01d989cdd65d219d125a6ccfcccf07ae5f210c5798aed6358d +size 894755 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index c8263496c5..954c3655ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9deaadcb9448aecbf1ca3ade10962054fe4597e589cbf17230ecb1ae22f0a35 -size 1086653 +oid sha256:0053c5350f7cc475609b40eed5aa4d6c75c225e130b94b89aaf1f5135f314321 +size 838853 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4801141745..76eec22ace 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:826a7fe01d1c6db3c35de62e8061c864b510e99e99e2eff77d3a5cfef8f6f603 -size 1027027 +oid sha256:68a39a816e3ff903d6868e079aabcb54a668766632f9efde5f70920a3dd26324 +size 907691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 99b8087bb0..ee6b84dc84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23c03f688cbea0097d91e3be1415b39f5780ea3bcdc3753a96388c187d7eb366 -size 800777 +oid sha256:3d4a6cbe995099c2f823ba4b298da58a72db4ba3d69c6c4537e2037f9de90ada +size 802109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4796ba6149..617a7304ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1efcf2b9852b8e6f4634bc535cc60324598356fc56d57f26bf8cea7c2b260b0 -size 953569 +oid sha256:511c911e50eb62320dc81effd0ca0b00133719124b856e4f935f9978e2022f4a +size 834429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3d651a0f66..b42a151ce9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:750acb94f19b8963f4153aa0f4a3a7d922bf30d450f069502f15a7397c534999 -size 735457 +oid sha256:86748a3f0acc833f2c7cda8077759683aadafe5132cd55641b541687be0467ae +size 738961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 421313dbd1..81d7ea4dc9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac8256b81650ee3ca52c7730f7bafc508cf64981e76609e84af1887bb128b29f -size 1274487 +oid sha256:1b4ead35c9da2daaaf299c4f4089b73a5379740697b18c2c9f59562333953def +size 909865 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 83d844a88b..82bd74ce84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a51e44f4f2118cb0932ba18afceddab9c78aecec37e58a8ffcc02c857ba6b1cc -size 1156671 +oid sha256:d4c06aa79ede161f3b2637ce7c3acbdfd0f9870a3c6d5fc9604f709fa4d0adca +size 794959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 386126732b..d9e23f54f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64f74d83d192d81d1c114c25554d5f56eb9cc7740cc94a5cce7111668c9de170 -size 1270687 +oid sha256:25dfb5a9a42e640cee366a60a46a8365cc8ba8800bb9da42b94c1c40f96b7e78 +size 910109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 310c19fc0a..7769ee4b27 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c2f537379dde46c796e3ec45e85405fd3dde6c8a74f56c98ef03b4c0e11fff4 -size 1146013 +oid sha256:e3adb6f5910f58f9882be393e9f315d8d5bed4036c59aa841321fab684364b41 +size 802455 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 60128e37fa..0a182a2dd3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d9017b2d8723f4a95dc241b5b24eff7fa953440186073b5324a863dbf07bc4b -size 1567411 +oid sha256:3126c5c4f85a16d1809c4d0cdf1384e776052c342cdd8c79af5471d42494474e +size 1042011 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 7e3a791bce..2be4f09444 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d5079ba3935baa8ce799c37123a527d97e1482edff376a3ccaef278879c875e -size 1449201 +oid sha256:3eedb21097de8f27b693a729445920a9a6061d258792ea0e65f50205093d94ec +size 930411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 58e2bc6e1c..24d9b11fdb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25442c8add70c7d7c98497e5a73a791ac5ee8aa5954a1a60b76c4b4a7702143c -size 1330235 +oid sha256:ea48cdba998c569bd3eeafbeb752d5fa81c070d56c6ea82d694b7492b003aa5e +size 914849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 4c343fa867..cadbd02a86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c1dcba93a678787567be3dc9398db05a56c18b06a1714ff3671cdac8d80e567 -size 1222879 +oid sha256:26b1c3add5371aec32bc6827244248c7299ee9e5a7979377b5318932c41fc4c7 +size 872365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 5d4624ed89..4f04dd1801 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:62ca4dd3f94d9783abc843d8ddb0212107abd1ae51d315288e337471c187397f -size 1324611 +oid sha256:1485cab1d7c14ca02d165563c068c412ef306ccdc3842b952cf8af0f92f88c28 +size 905031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index aeb677765a..6a34435137 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d960e3f2694f2fcc553257d6696006b09eaa65340c342fbef09226ccdc0126e -size 1215725 +oid sha256:c1cf2c235d5f9d272f23a02c98358a97e5a6f070537dcbbb64777a78a9706ae7 +size 862351 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a6c4e250e0..9513cd1ec0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a23b11615a1a74c16ecaf0ed69d0803d36810e8289ad9d5d37e0deb4f90f19b5 -size 754543 +oid sha256:af7c79b48875bd927a8a42a19101f58360211551a941d31a1f18f53594353bc5 +size 757553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8a9b851cd6..01f421259e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb8499b61d454a9c2ec9ff30022e1f4f3728072be4026ca11c5f0953852cddc6 -size 716901 +oid sha256:64b88e73f02fad6e4431aa57353c277e558b317e723f4252052fd5476de8123b +size 722771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 64f7a7a1d2..e94947eda3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7311278478bc7bfdac24fb25b9f83612222e57996089b4e53e50d49daddcffa -size 775357 +oid sha256:c3ed110ab254df7929cc2abf0969de74de1250a928d28e9de15125b21f8a009e +size 776837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3f5e01a62f..f41370b798 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6641a7bc9fb3e207d34a466ad1fbf0cca699ea0767839679937aaa99c700102a -size 736973 +oid sha256:e1d6c374299163b9235edd7672b24fc10d19094d579e1ce4c6cb8a99c542d6dc +size 740525 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b4d49c61de..b7503ea8fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1ce0ae2afa9e33d8c690abdc17c5f3183eb6174e15a91e6bc87e1b0ce586567 -size 1396833 +oid sha256:256f09a9d56faeac8b756fea6ed8299e5f7dbfe5519958ff3c20ea1b58420c23 +size 983321 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index d6b1563966..19385188c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:497636457453c192b11f145ec88250e39d1e637e70b9dc34c671f56aece3cd2e -size 1210789 +oid sha256:38400d040eccd6f6aebd0aa345cf3e33d2bf4b4709b1eac1ea9e9c76858e1164 +size 873005 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1338937259..c37f59ff52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7ec57064ff3570bee73c7a38fbcb6b5b3cf7d39d06fba6bd6eb5a1d11be422f -size 827487 +oid sha256:1cabdec55ae5734ad4a123bd69a9efaf5b1f7b7132fa8ee87de3cc7cd28d2f83 +size 784025 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ac6c78d5b6..004f9d3c32 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b901928f63db1c4ae10ab94a0c28103fbdaf8b3aeb5c1eee48c4394dbef34b3 -size 658463 +oid sha256:327e4371bcd27712a569802ce17d89558deaebc403c9c74550626e6004eeba12 +size 663643 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6798fd8976..60f82e84ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c5c007a15c406e4123dc0f5782c784dfa5daa58ac11d5f270327f4862391587 -size 786095 +oid sha256:36c79d5dba75d6a1e55b86cdcafa9975fc3cde7bdb3d1283b8054013b3d09864 +size 724971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 883ab3d144..b092b32079 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6eedf1ca96eab391e3475242e96b24c6c14fc4e072693430f5bec603b04e673 -size 620869 +oid sha256:fedeafa0f8c1cf7fc920d6754b52324d90a32188236fc72b3faaaa3a2aef6faf +size 625359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e0e5a8dbad..2a9637daff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41d9379f220dfac66833c427f09302be741ca9ca8d8b130ee8f68d529087eed1 -size 747391 +oid sha256:5400301a8917062ef6387080aa332f896c62a7336e6c7c2681eed9c1b3f97f37 +size 747439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9a004e04fc..72337486ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce36adb30d0d987b8aa87e594459d324883f90063405a03d4a4175fdc2b1a5fe -size 709747 +oid sha256:673d7728996e2c34ea11f6ebfcb5f8bf30ed57592f157d670676e976228d994c +size 711917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fed122ef33..926ff351a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11aedcac7c56701e7b90f1b7252dd9e19e787713b4fd520a755a3da5ca678e61 -size 768203 +oid sha256:b164243e7f5583e90951fb0ff2a18523621f6bfe87c7e65b57ce66d24a70f66d +size 765983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3acd2b57c9..836201ce41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc8805b581fa8cde6264ab3c9dda88dca4e48c41fac1b2ba8390daf067e3d4f0 -size 729819 +oid sha256:f6dd17d0554e38483933d793f0b55674d177438af76531e645a3ea23fcb3759f +size 730461 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 37724c4c5c..21ddd8e8d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8d296ba0f15ddaa3832816f7d4d82c020febcc9b39c83036c2d4e51afd9f9b5 -size 1390469 +oid sha256:4a9d5b34c5e7563798d536f88dac96242d535347ad3b6924b07903e13cc5d411 +size 974393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index b3147fa9ca..ea2e3e0d62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a18c3b5429d4c0982d28a5904ad07a4af22cf494514c7336c19f9f0fef7461de -size 1203637 +oid sha256:abd40b777a180b89b9055849a82809112d915a9252ffb442321921094463521c +size 862891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7ec278f3e0..99bbefa65d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75142518fb23ca9c67fc6c22b4f387fb43cff98bee478b9efbb9b5eb9edca1ed -size 821913 +oid sha256:62c94bd78be4956f419887bed4a5719ce2eb9cf76ae6d15c2a1ce383b6011d0a +size 774257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6b4920a3a5..cac55949b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80e80e9fb5ef8fb83da048f1745cfbd78d48cb7d483d61d06a1f3424ec5fe36c -size 651311 +oid sha256:69e4843f0b11b5182abf649d61cf2fdb9dd777b8a56119b6dbc090067329fd30 +size 653579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3353041352..d11bd280a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa428f5397b2adf9b9e36d3577e3eb9f2493cf7c39cb01e0715c25af79911cf3 -size 780323 +oid sha256:5c2c949ff9900d799ba85530cbe9472bac6d3ab258ad2df0bce29e8cf0bcbbf4 +size 715253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7a1e5fb14c..4af287a89d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a137cd33a121b46e95bae3ab06cacd5b5978f702342f6bd8c8e5e16b094458a -size 613716 +oid sha256:d4363cdca495c1e4ab08ec27ed56cee97a6f263374c17332f6e7f101803da6fe +size 614504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1e7a5fdfdd..001c061e98 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:821d61b44717e470df683e5e15619f16ff1c4f3410b8dceb5f1871a1d36e1d1c -size 782993 +oid sha256:349eab2050d7dab145576a9d155b48aeafddee7927afb17720ac266853713b09 +size 779885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0148d0fff3..7d9ee65992 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:193a51b72d2456c5628fc695560ead2d80586cba6f5cfc9373612f5f3c52dc38 -size 743771 +oid sha256:fc2b739a9c52e322be0603f70ccd736c7b5c9e38880da7210e8b06945267ab5e +size 743525 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9625e42b42..f2d77ce0b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:426dfd9aa2348a4b734b98e54c67596547d2dc006ed391215981af9e72258eb6 -size 807259 +oid sha256:86352cf27ac9049a9c2e2c3bcfa46850e7ab56aceca9347df3023f757f36f68f +size 785503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 36490351a0..4ed5e43874 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82f49aca14401acae4332d3183ad9051178ebb1c09ce8feece9d09308031b6ba -size 767297 +oid sha256:f0d0378361ce40d806f636a9cf49f0e8e8e2fada5bb39a2f52870239ca4b0b6a +size 746825 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 60cb856277..fe2e188a62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89e3219f0607f30b29639dad981083c3ffe7af5501961cf47541de9089473705 -size 1623209 +oid sha256:161d014e8cd8ff47277dcee8e0576856935819b199de9e15e077602952bd7fa4 +size 1077335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 05e3065eec..d5513bbd38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81e17eb8e82b1a3c8d5b5f8b1cc2d09d034f2ce743dfe150ebd55a77d313b554 -size 1515111 +oid sha256:040546c0e85195e3132776a24ffe9900d2ba8f9445bbb0cc94c0e8e8f2cf59c7 +size 1001897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e707e95db1..bdc992cc22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7606beabb70db26e99bd8ab9f1eb56258d13ba7d54d7561601cbfb4fa0397bb -size 852485 +oid sha256:e77f37d7b62664a59ab3ce7df5943437901751e6b62fe82c1618fbae3fba98fa +size 763241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7c56894bd3..d3f51634ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5230ed5bed60b5182ad5d6546a85137012128808d278727198dc4b061c23008 -size 677787 +oid sha256:7a38929b26bc850d57e1d7dd6cd1ab1e58ceca4e869fd7d0929b78323c8ca9c6 +size 671867 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index be4a89bd4b..12e31d48f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51cdfa31ec0a6be1044401c045b4f80fa58989cfc44fa730630079c698aa23f1 -size 810647 +oid sha256:9a9af1349797e71e78d33da25173928e32672516594e1c60b0c74945ffc520c2 +size 719677 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 14a3c082f9..272230faf9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b2da9153dbf254e7676660783df62b1a7fadbf0e2b36c89df078f678d67ad07 -size 635407 +oid sha256:b6ca4aa5ce51b5bf25a8449793bcc582f7b525675ef381e532c17d90c0e3ada8 +size 632991 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7c35491a6f..4810bcc87f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5036fd35c8a23e179b8a1be2c25f066140be1b0c64d1d474a0ccc60edb766967 -size 775889 +oid sha256:b8a862afac6941a3355f9642a765420ed1fa6840759719f65d120968fecd8772 +size 769821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cd04765621..c842aa1bcd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49100ffa77ee52a9b5e648751bc65e5fc47087b35670ef3f05555eebc8fd4eae -size 736619 +oid sha256:424f0d457d51b2bb1b3bb9bec61f9eda50c783600464511fdbc88e19e7639a1c +size 732671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4463b268f7..528ad03411 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a218fd6f93a6103303f80cb50f5bb84de9b0c180bea31fe0f4b15024a0dcfe6 -size 800107 +oid sha256:06d25f0673fcc42f792019960afb0c672e21e06f87f694936f0b3762a2299912 +size 775391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 42d728fec3..1ca36fedd4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edf47072bac7a563da80419841a678cf6e97b4673e13a6cf39372e32dd0ec877 -size 760145 +oid sha256:dee5c7ee1ca2b28e915575f26a555580836e6ebbd9eca9bb3249615eb1351312 +size 736761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index a967adc731..00dad5ff6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea8e760d99b256d8531f6b244155e26c29137804bc3dad396a4d32a726d39ffb -size 1617585 +oid sha256:70ffdb1484ab5b6497a0489416652e99d8716223752f37f0788090d8c273dfe4 +size 1067617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index c3e164237c..92eafd6cd1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5c5c36e26fe54b5157a887279b9494b1fc1fbdca00ad47fc2839ff739e7ce4e -size 1507959 +oid sha256:4f5ad3e40af7662d7f54f911027fcddc508859b289dfc2a36efc4fc728b25d14 +size 991833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index efbc79bab4..cae60c9752 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfe3f3219e4b8fc969ed65803bfdc7769521386fb360558fdc0214d5c9bc78e5 -size 846121 +oid sha256:a6281275f4e82cf2d7eb372d60e1c8179cbbf2dd936511f480737ae66c9f6b9f +size 753521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 94170a5830..d2969cf5e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49bb4a258862e82d43e4b426ca1313926e6b8a5d36c57bf126b0914b1d766cd5 -size 670633 +oid sha256:bdf1f6418f430f5d6f1081f8bab35b38c50a598f144874e8690e382c5a8a4786 +size 661803 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index acd16d66e8..973af1cbb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97ba5f8f73c236deba0280c9f60118a0607b5a88f8efbf1e1365ce39eb02e9d8 -size 804875 +oid sha256:e8820586bb76db17e4a265884de6a0294c6364dc32934bb90491c0390fce7ee5 +size 709959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3e2ba74eb3..58bb247d74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80d03d32045abdfd969aa2b599ed1806fbc6b27d7c3886d5a046a12a3d1626e6 -size 628303 +oid sha256:f71aa915167554a09f97d1d109f71ffbcfe365a0fd263adb15c0593b2f63b9b7 +size 622927 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 7e0eddd300..d72f2fd94e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:640aad86ceff819563c5b3c4daf8fb2eed3d457c7ebc1881a46d948e5b9d3eeb -size 1311539 +oid sha256:077992908f1115d26140ea7bf043f5888c6a65ae5469e8905649b121d657529f +size 955697 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index ad5a894362..5a59cd8e2a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2c9fc3f93e39b47f4e5829993eabff2238161ca4223c3017a1fef0ea1193c38 -size 1195005 +oid sha256:6e46dac7d1944e7b8712f69f5505e603c1689f0540085f5abc42b02323d423e4 +size 839015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 3953ea28f0..46325f1b45 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7eb6035f523525579bab01695859c96aeb7c9e9e1d84a2c484622f3a3b0c58b5 -size 1307541 +oid sha256:f0e642c66a92680e205891362c400f49a19be1902525ebf3a2b268ddff315061 +size 955153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 2375f14eca..93c73475ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd95450e6666923e9eea1ec0f5162c136d556e0c7b536bb41541aadb698eab9d -size 1184051 +oid sha256:d71c92588ebd5f4a38a6f1033c096aebfc992f3d447b68dc44e5cd0f4ed0da4f +size 846857 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5cb9682c58..2115eff47e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6172d40b3fc262575f1e62e2e0330117be92370fe2db131bd81dad9b90be2f2c -size 1605301 +oid sha256:05fd58a6fb9ace98b3b6aebc6f03607369295892b970d570806c7e978db05d1f +size 1080443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 13aef4c589..af6d40a0a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ecc364986e532293c3887602a0fd7062adcfb889d96156c6ee1cad8e292f0091 -size 1489951 +oid sha256:76f075f590220c69d0a0c40a168af5e8ad8c1eabcbe6af5f17caa765cf252bba +size 974419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 32fb61e9b4..cc65eaf182 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:406aa019379f14ba7a8e70a2f08b3d3d1c4b866d8493e0b7f8fbcf80664ed782 -size 1383369 +oid sha256:c21341aadf468f1fef42c4fd31733f8299319cd69d9984ceb652e04be1690eeb +size 959893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index f59bbd0c1c..7e0880e3a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ce1231bdc08aa1dfcd135f4be0e96b8229a69d6f3b5a113d2110a2b6a4d0793 -size 1262249 +oid sha256:f48f4e6a5c8c2c417837e073da3146e3b911cacdc4888583901ddb0b917fef79 +size 916471 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 479731b867..25913ab958 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:290b4693c11bd551acf9755e159e39435a31bf4eb02a9aff6127107f12b61324 -size 1377697 +oid sha256:3af317a7e2cd935a45ec409975b8865c99c8e206c1f53a3fc5886f15abe493b0 +size 950075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3d4b3422c2..080d466dd7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ab1e5e8f04ef9882dc405eb8c8c4c9daf2e61425bbbe6067322aa471f3a223c -size 1255095 +oid sha256:cee7c2212738ca5022e59376aef5b65cf3258e834be015cee6bd8c0c142d6529 +size 906457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 3fa271afde..db1957136d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3250a6fa161feebc67c1f3083028ce3bf29da822c10ff971a7747a8237ea580 -size 2020097 +oid sha256:47bf996ae175f65a7c8c62d16c89455e6daeff116446f7dc65a8b3e82564318c +size 1134219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index def8972a65..3feddb642c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b922f97479a71bea55bbd55bc398d6f87e28412b35368fc631ec9a34cf057e9 -size 1997003 +oid sha256:30795e3c9e22aba6c842160e4481130220e7e5410510aff99c9b282d86bc6e83 +size 1112605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 7ca3bd5b4b..d9a2a515ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2dfbf776da4d62eb1c209a751f5e69977b6153a2c02808e1bc142f653a1be1e -size 1264143 +oid sha256:c700ba347301e2a7f80fae43b93731850014df62b265ce4152d30bc9ddff35b1 +size 955069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 67373f487c..f6a55440af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55b565d8128d9be50aa9784124d245b8117e39341c87c3e969d612e5f127e96b -size 1147117 +oid sha256:25b8ddd7e2696cfb0bfcf5cbd24d7198a391fb66455c9d9f09919c6b637de265 +size 849537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 5e3c6a7ae6..7a861df539 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0cd3af0f8e8624dec6fdf0b9012454623b96ea1d2c6ea0e57c3e0edd4d3b3016 -size 2012945 +oid sha256:4f6803387cbd9138c303bac5a183549c9069d52cdb968ce0c2cfe4534296f5ee +size 1124155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 399bfe87d3..d347d8ae84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a78bc8adc52e550d3722ab18783e8149b7ef5a934d88ba197a09bbdda797e8de -size 1989851 +oid sha256:86b5c1952ce8d68efee3c110b0ab8bc83914ce48c51fc99b161eaae1c129c3f5 +size 1101751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index d64edcb859..7bc699cc97 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16d51800e0013cb735d113f2025146d8617b1fed390ddce2e9aad01066188800 -size 1257779 +oid sha256:8c75f66605c116718565cfdd72237c742b60472c4fc6ff9b33207bb920fa7b25 +size 945301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index a98d91a8ba..407141e225 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05961d4d2e905bce2d8a84f5faa8245d46c93067e0aaa320053517200ae119d5 -size 1139963 +oid sha256:fc6479acec435305c5f91d85b3ff11825b19477e6180b4d0ba0888827a739303 +size 838685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0e9a590777..b04878f584 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b3cf2585a0a9906afafdbc1aa394f5514cc18b500301ed11b87d4325b4038fe -size 798699 +oid sha256:4447890153e6f48b4b191d0580db01559bbccb991b6966903dd5ac209012f8f3 +size 791299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f580210648..a0a607d87b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe52d36ecc7234820b3f961b0aff20a9d45a7da7757e6d11f36aacc44a3408c8 -size 717543 +oid sha256:8f9574e6c80f974a0e5a32f5eaadd33acf8fe019491ccaab37c819380591e296 +size 715323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 341be978a1..3b3ac56784 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:345d2b37e1b7e81f994d964f65ea3ab42a5c8907e86213d30fd7daa62ddd1c89 -size 796917 +oid sha256:9e69086b0cd80f23685f0f1582e62bf5570e9a9f142b1ce6fcbe6594c7698ac0 +size 788975 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4270e4b134..56905b6a38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03488b5374f859158c9e84e1acb0c26c9658b891cba9f9f99e921261cc99deb7 -size 741267 +oid sha256:510ab19c8299c9bae80cead4479e74318316b8170610758f364303badfbb547d +size 734607 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f08fb64390..4c52b0eeaf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c47e533f7db887e36635a907163b058014ae55897dead5287f62843647769491 -size 1449129 +oid sha256:f246d354659b64fa75a86a1e24af72278677a3f98a3cfacb67865d383df26a3c +size 1029155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 66c003c921..ebdd251e1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9113f437668cad9a0be6c4601ea223b45b09a46642309c8d346a4b35b27cfca -size 1249075 +oid sha256:a63b89aa6f4b89a1dbbee2d07e51a4fbffe3ee13ddca601dc2b14d1b8611aee4 +size 917751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index d672ee301c..1c00a6dbb2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e2a50de2e1eb1e8d101add773adf21b76f0c7e0d15ad306bd512066fa7dafbe -size 829957 +oid sha256:b06bf7dec580b3c97ba4e338b0008cdc083e5bee319c4bcb43b9835066a5e490 +size 776825 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bc50e60f8a..2d81fec0d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49d63c83fdf86d06565bf9276757e91607745871c34ca7ec55c3fa6f4aff89e0 -size 659601 +oid sha256:60aab8c47bd5a5bba6f9a8317b20ddbc11fc5a9829e64cc0421549aa34517712 +size 656541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 03af9f7dc6..b61bbe046f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8cac2eb0fd21304c00117fdb11be4a3606b7ab1cdf0275fddadc8fab035a28b4 -size 776033 +oid sha256:6b6fcc0abaa31e7ab8b9ed80c1cfe74729846e82a8df5fde43e0f0b332137b69 +size 716685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5acc4c985f..a36e306b41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed49b07e7c5b2795de505e35001730ae3d3848dad756c6386c04f5b15bab01dd -size 620477 +oid sha256:d2d1c6f53547477396ba8d329cf7097e92ea52aa0287e0c11a14ba8c503d4c8a +size 617368 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7933e47b14..dc86689589 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1c24a6db80f03571e7b805f6aaa467c8c2ff1b416e67b5503f2508500192a44 -size 791545 +oid sha256:4b88410c87d6859936ac2aeb62e73c136e22e90ab04bca4f1efd6d9cee574500 +size 780445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4636bdef6b..7f1b1bfd8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:636d7c076c215149e8dfbef873c7e20eb68ada310b5163951f7c1f1842de1b9e -size 710391 +oid sha256:8581b0c1466d86df192987d45730a184441d528f956480b9054f1d851a0e3e96 +size 705211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5d5f1de27d..e0e29f8a37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c52ab9c8337411d9196fd27bd4028fd1c895d61fc158e35a617c7d44c884139c -size 789763 +oid sha256:734343d81e640ae9c06718585bddac4b7225a96e87fa022feac4ee9f6906b520 +size 779699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f295307250..27aa032713 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b6e1af25fd06419682dfaf5b1e722b5f27613174cdcb341c090c538cf28f449 -size 734113 +oid sha256:3b37ec93d41e4c4e9369f19c632dced8e3ddc9590adcb73e83773f47789e43f2 +size 724543 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b5a09461d3..f4a732a61a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1edabb67d469ad9b9d8586a89d9d9d6b8f56f66a90bbcb8c6e44069bc3406322 -size 1442913 +oid sha256:2cd5f9c82dde19b1424ce821a3c08770e172f3fdd21989c5f22f142eb940caa0 +size 1019435 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index e38e9ff4e4..d798856cde 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1c14db897ab4ce16e987294270ec7733bd03fef3f06afa62019ecfd870d9ea4 -size 1241921 +oid sha256:e60fbf9e175378df35af8f65441eb69ee2f4b517a327c9ed4fa7230f79d20d08 +size 907639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0da649a36b..c3c5cd804d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14fefbc5e59065f6be98e9994bc92b51faae22cdfd7b5fcfc0a8f4ff51ffd11e -size 823593 +oid sha256:20b92b07513a647aa23adbfc91b66aadeb153d2584f1a36c869075c45b3ef90b +size 767105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 956cc79c3e..d0e552ce5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c359ea53a16ffd3435f8a73df4315cf7ee4ee811ad3fdcc9990c075ff18a515 -size 652447 +oid sha256:b85d29eedb788e15b4bcd2ca9f19729e690753d8e99d46178fa3a5bd56c02499 +size 646429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ce0c68d3a4..d6a2710beb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c05eee1f95b7d01d6e790526e18d5ffbe27c9d09e8f7bd5799ebd3d658d3f973 -size 770311 +oid sha256:5f27f1ba48c59fe744e2b23faed035cf7349541c6834d90823d9a018a2d97702 +size 706867 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 671ae38633..23d513cdfa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:623648fc6a080b0dc48cce8e61d6dd33f8590fd534b3d7f48a18e5df521a3aa2 -size 613322 +oid sha256:d5825b9d94752fac119f516a5b6e33578960281cfd920208f4b77e77c8d21db3 +size 607304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ab2ac588c0..a37cf7241a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5975780fd599d4431a00b12a1ce2210d59fd6b6a77388be6334f2044dbf23e1e -size 831391 +oid sha256:4b39969610e68350f1aba25b7fc33ba559a436431677eae1dd8df40d5eff8e51 +size 799375 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5ae8217ca8..59ded976d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:baffc91973da27a16ea1854aa254a71a6bd2272d4ef429a5204165f2f16c270f -size 744415 +oid sha256:f6a0cc57114037f496bb777f782e00f47bfaea3aeb362439ddc7978010a640c7 +size 736817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 205625bfbc..ae14bce449 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d8f79ae7a862c76a5e70bfd5427f005a158046dc0599419e0e6807292326703 -size 828821 +oid sha256:925717cc147565e6c35123f4078fd261046ecf296466e157e9bee23f0cfd483a +size 797593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 122a18d05d..3b5e532777 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81d9d8950a2034eba9fcd789541091fa41cd67748f108373403a99be3e6d7192 -size 770803 +oid sha256:bb44faa45e4fb3713b40d6824c88dfacdaaef81f1edf00c5a8f7341f7c56f70f +size 741695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 87a1d5b404..86224903db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2dcf7bea83770c5077615e4fee7468e2b0c8e377fcc2dcecd197ebc441f97272 -size 1677921 +oid sha256:94568215a58e33999671cd048b9b2ea1f802f5ffc9ade5e9872146133161ccff +size 1115917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 72194e7fda..ee0b278d25 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3b4fc46b1f4b7818cf7a49372ec6981e62ef3b0c34a9e7bde8254c4001ef207 -size 1556061 +oid sha256:cc1cbcd2664f59e82ebca5bca4fb3a1fe99e33ac9790e6c214a5ca8954d6a97c +size 1046003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 55591b2cfc..a64fffaaaf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e525b34125fa1d4a0d3a72116479fd339db03c4f68d4e1449aefa6542165607 -size 854163 +oid sha256:3d6a5a1740826d167c9ba481f3e584447c4ab3e991429ccf46bcea489aa1e764 +size 756039 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 00181811c5..555015de68 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ef1d406ef431e0383e4982c0a03f2b157e2b38bbd0a0cb75851e7767cf563d9 -size 678035 +oid sha256:1a97274b7d996f7a95825f0dedcad48354f5a3c0ad6cfae68c1d494d5152837b +size 665603 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a8a73560b1..29b3e281be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67cf0bf35a976bc50219e0dda32655a0ba198302357fbff37c94570f811dc0d2 -size 798513 +oid sha256:938fb4c52fbabe2ff08e1005979ecc6179a2f6ec5120557ed30254c57cca3167 +size 710355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d0405f2159..d93e6b7ff0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:875c422de595b4ef40517fc33742f855ec7d6aa9fdacdef55ff75b6f54ce69dc -size 635065 +oid sha256:bad51209ffd5dc310db104631bbe98f65227200fe5a1485fcd3c97272ebbf874 +size 623717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 60b198da94..d9bd77fdc5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e53a683167c12dd2855dd287de14fe3703d17fb717944b6bba2af7f5b5a430e -size 823499 +oid sha256:f487be426fef1673ff30d9a6307eb5f435db8730032af107b9967f858600f99e +size 789311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5a3fe049d8..9662991548 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec3266c9c48306bb27f48ada56adb56d91682ad3743d01b271fe02c09860c5ee -size 736473 +oid sha256:8ff900fe224877db8262e30e843e4ab1f6e4d3561780f1c2b1087d9f1bb4a3bd +size 726753 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 774e2eecbd..1984e1cfa7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4174dd27d32777105ef0a67fa089c9b671440649e2bdaaa305412af6520b60e -size 821667 +oid sha256:bb42b001c088cf72ef20557c4709739f074b99f56cdb839ea99069163397ecc6 +size 787529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d13f2bdf19..714c45bd07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9bf2aabd24b096f13498e257a8899a2f5ed76e6e81e619235800408c7c9395f -size 763649 +oid sha256:8080080b3cb427be04745bc494779007cd541c1966755131a42516810e8da015 +size 731631 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 97b8afcabd..7932cd5665 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ab76f3856bd5dab85958f040acdfa3436a84ca7b21731922ecc4eb284ed3950 -size 1672249 +oid sha256:50eda4b76bba365d61646b6fa90cb05dcb6ef374c6dd2cdcec0a4bb939ccfd38 +size 1106987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 67e1d718b9..0c9f0d726c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51eb3ce6177637c9e8d7beaa1b84f5bda188db8f3c7ed840e18290d7dac2dd68 -size 1548907 +oid sha256:c417ab52b3a91a68da549fb45aba4a4f047ce64026d67722e79478138e7f09f0 +size 1035989 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1374b1a145..6260ec9d9e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a2e1e623a3830618cf3d1e5f1898d92debf4a97a48ab17eddb6abff01bece96 -size 847799 +oid sha256:e52a1d9d8c82ba3a63fb8c4aaefbd8943c433d891af859b257e8b65db21b2c3d +size 746321 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d592f8a55f..cd68647dc6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57f23eed35abbc936871498b9e39f164f8f3984d2d2d1c1f88f4cd546c37095d -size 670931 +oid sha256:441501dd5c456d704cbcaf6f55158b42e925e6683f9d300b641a44eb4f745bd7 +size 654701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f9d8fff85a..80ace4f80d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00e3d1a3589fdffbb0d70988dd21c7e846b2245a65d75fab58622931020e4f4d -size 792741 +oid sha256:ea5d18703083e6d82bad46478ec77db6894b178e8b72f8a1df17c3709b46d3ac +size 701425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b6c6cbf827..80ded96f0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a87affcd9ae5a0769da3d8b789fcdc238e22445693770cb2657efefebaf9d07b -size 627911 +oid sha256:a37f22caa72cddb57554b44a28e397ab3f8e99433860f6e8971895a179b43a0f +size 613604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 24c552d14b..4db5ca4ca6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d73b9a1b8acca2bd7b2dbd03d8a55028e549375d9f247acd025be8c7cc075bb -size 916527 +oid sha256:116485e4fe05605c939d84b7bb1567b129190829683764d2da938e824c09e499 +size 739863 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 94a78738b0..7f11e267fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1dd43b7d7807ae6a19388cc82a567ff605343fb76a8c714ef602fba5f970a3f -size 825499 +oid sha256:14159cd4e96b7eee8f2afcc82b978e0a977aeb9d792c909c884010975c1d1371 +size 650661 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index de7a498e14..5f9795767b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3bb17f41190870f066d9281dbc8b8a8fc035923b5b75450b0d3cf4215da03bb6 -size 916081 +oid sha256:c8b99cd6a37b20b042c5bdddb7728b276647a4f1fa66a48a979eabdc0fb85bd6 +size 740009 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index bc2db64098..366e25d0f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49d1613981a437df9063c393f11265e0b30655bc8eb96cd5c46edad2cfe577a0 -size 824115 +oid sha256:143826eb6cd5c308f56d0f3300fd8dc1bca23a47bb512d96efad61067376c148 +size 655197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index dd2a54c35f..0924ee52af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0266dd250ccbda2583dc37cca4cb6d96291ce8a718cdfe42196cfb2b5c829596 -size 1050203 +oid sha256:d8d22a3104193c7f2b24f23ddeca3594d93514f30484a06ff830111214da2835 +size 808075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 7eab977c68..146b916f46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2255870528ddf3826f021e9b666ad664b028d9938d9928e5fe7613a7c04b0924 -size 974221 +oid sha256:fb264ce19f642e7a31dbc6ac3d39b85fd580eff20141c2e51a26ea9b97bf882c +size 717293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 992670ca13..8309ce4af2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ea64c2a1bc79e3cbd30823802f282adc4a23886fe5467342bfd22d5332d5e4c -size 997977 +oid sha256:e95aeef7bba078fc955be6453ef139afc183b4ad7e6c1319438186f9506bdefb +size 775977 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 07e9ff9bb2..e0c74d1766 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ada105b2523d250d45da3cafe70957dfde27cf24cac61e142ea0d259c60fe64 -size 898809 +oid sha256:c961c3c40a06e30ec5ed9e22f8155d9073fc6712028b235cecfd2a767e5e6cdf +size 738229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 989a2f1b4f..f05414d562 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1845361377a1fca1d1fecc32f01286566ba1b01645d9a6cc5bcd1a8513500c60 -size 986089 +oid sha256:322ae2a99e041c8716b116a953b6a13d0facd6f7002b02b669a61d0b56775e8a +size 757281 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index a632811abf..8661838e35 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63125d2f1039e1abc84bc00300ebe71ca46a6bc2c61e6cba2f5b76f7371cc29b -size 884503 +oid sha256:10567480b864be1e4749e524ec6e1d8e19dd0d39b8a216f832949f450a4c126b +size 717263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5b3f3e739b..87a2f4a01f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40b5b61abe9096cce82a7b22438ec573605591a1ea6f2fce77a399c9bfa013a3 -size 906887 +oid sha256:2d603cf1c8909e474c5e05d6a85792d2a58518d17425f5e5899f6c92260884e8 +size 920503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5a7ec22af5..826f65d571 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87c790c46a7aeb8000ef623b24ca9d3426a4303adfc5786ee9eb0a39419e9bef -size 831997 +oid sha256:e88c5fd093a7bb312bc1ee4e757fbeda2c2c55b04a4b09a7443200aaebb2d116 +size 844971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5012ba44e8..a9ce6573d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50a4ed8ec0473b16763a80a4a9c12d192ad4333fc65a133c525b04e0e1d1b97e -size 901899 +oid sha256:6c74a4cfae63465b9ca4dbfa34f2032fdb8c8a2279eac1390b899bdb5a2b601e +size 914971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 85bd263e11..3dc7c27c62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1af33ae93964f6be3745e83acc033d85ecd3f15bf59f0916ef21114fb94a3d9f -size 850885 +oid sha256:3f20e7f59767836f80aa1fa0467d26a1f1e085d30249a292b1c9c658785ad2ae +size 866821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5a0a5bbc2a..922813a11a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbeab184960349edb58d77de306ad5240d8bcad8465be0ab294caedb7e7f2a0b -size 1067239 +oid sha256:9a8792865f567a4688f252d624170d890df8c3c58c08a2f003060d71400b8c65 +size 846719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index ee1ff2aaf3..7ada85d127 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2969e3d18e2555d3552135dcfe016cfca3f66810624fb62fcc6e1979c8b550f4 -size 898807 +oid sha256:abb4f66432c22d1b291f11cac8482c9cb2f8eafb4f2f94d1b01752fce60d6981 +size 742865 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 17368054df..7e989f3027 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1036678636042a70e0476835ebd1d1d74161e71bd86cb85890fbeef8fa363e14 -size 974009 +oid sha256:fd3add0bc0b5cdcfb172ba3b3627f0596288a6cd30c264675269071b6246df47 +size 905189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8357cc2d4b..89ae96b294 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7771d6d9f850bd39a2c5cde9ce199d123de564f9ebf118b3438a3c8e0d635567 -size 764237 +oid sha256:52b740765264c18d23c8b128632e0579be776306befdd3519e40f00797b05e32 +size 777063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a2dc08554e..244ab3535b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6ae2afe16149d6a4d3bdeb46c63bfdb345600ae3152b10984250aea0dc04134 -size 921023 +oid sha256:29720a97be9faccd0dbb89080e0e50d13d26e064d69f00961c9108c767cdb435 +size 832767 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e4f971fb48..94726895d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65e8b38d6b21f9b97fb193506f1b3dc9b9ef78ca0c8699324d51352a16cc745e -size 722597 +oid sha256:a4c893d64ff55c38a821e5fc5f053df6373b3d3e7c6f774bbc929716d5b61384 +size 735325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 87cb8ca2b4..0a92f943d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3f975143efb8369e2d2e4829a5859507b0509e61f38231b5039d95bc92fa865 -size 894159 +oid sha256:927e56cbfbe5ab27088702e134a9b00cc95840c2457241c3f67f4fc08e2234cd +size 898747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index be0d28db21..d1a9f466a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7998023db8ab35867279994fb4a9af84db20f51af1e555aaa367d8b244e53d90 -size 818479 +oid sha256:0fee64a64337018015e3064ae4e3ce384f789bba5df531c261acfad590cc9f5b +size 824055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ac8f3257ff..97396290b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23f751a5e881e16636fe3a4e012f1faca5a961e46868ef21a8a7b0224e2a217e -size 888431 +oid sha256:2a18ad442baec6d9dbc8bd425024e515b885806a1d870291822fa09ebbc28870 +size 894795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b4c2e159cf..c1b268d2f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5de1b937a794a21d3a7f48d01fb8035ce346824ee86e7dc2800d347f741d79d1 -size 838207 +oid sha256:cb24c4f14cecab4db0b91895b3a4989c09234d7eb9aa2d04b1b6cafe58a12192 +size 845113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 2725bb81d9..c316c40822 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e5b9a1df93ec63d2052c0989d0375e10c2f541280f5c18a923385a95b59f579 -size 1055351 +oid sha256:2dfb3b8b7e5a00a3e4114a5a4259f3d7726a2a376a319eecfd138908c8331d87 +size 828023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4ae8a732cf..46447e2b99 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35bcae26ccb1e8d6a84c85087ab24f582a0a31bed68f5bbbcae4e2b7da02a736 -size 884551 +oid sha256:c94cbb95428f6b296565fecd56d9a0b05ee8f0c96525901e2b10573579d57fca +size 722687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index d633bc0eeb..d99e10fddb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a77aa41a8a09959e392fc720c4d7c1a5f7a33fcbe37d83f9477b0303b7f87861 -size 961873 +oid sha256:219d3da17aa931b9895232360b273f3d317662e6b48076a07263d9f0ffb41113 +size 886493 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 675f9bdbd6..0bf194e324 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d85cb04faf055e8f6e755570ed43d693094f39b35c0479b839114f08b667c9fb -size 751509 +oid sha256:08a3d5dfe04821c2a2848665bbda5d626dda864ad092744f5635c284b41ca724 +size 755307 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 35245804c3..0cb5fcd7c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:657621206b0c2fb924a3da62454909d5031ea1aa102f9e6b48bf3c0bb39a70e2 -size 909233 +oid sha256:84236cc990d5125aaec5f88511eb98c3a8c2241c9cefcb30d5794687b16c6f09 +size 813971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4d5f839c75..4757a14fe0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:442704482b12f9b83cf53b5c35e2d985bf33cda590eb1b25686e7d9bd50a7073 -size 709919 +oid sha256:97e982021ae2b084313680f6c160d3793d1a2b32401b8a351463c8ccacc85ca8 +size 713569 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 31a8483944..f8accd9e58 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aed340c23345bb9b7ecb13d1602f6b2df84d73497ce02d2d144a450bf89b3ae8 -size 941947 +oid sha256:cc1d008fd41f12e01624c889d8e7b0e7df0329987f802e375077cd021c53a0c5 +size 929959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1d10c5081a..4a2ca78d49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9a2304ccb7ac3bb32d579f1c8a1bdd37a291a3eeff42d7773d994b0e3ec37c1 -size 859905 +oid sha256:d0f60561ae2604d0a91f912124a2c04d59627396f332141c49fe5933676a774c +size 871695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 259a4cbf77..56bc77be88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cdb1726d18ad91afb74de9775ed51a934819d5a7986d794bc3ea02d8a0fe3788 -size 934641 +oid sha256:bd37a677f0c154749f5bcf7d3f4f77bdf9cd2203b4a79164abe16b784bab06bb +size 923639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ce2ab54265..e9978215bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:044d94d1dcdef637590310e3aa411d3c4c71a5d4e6274036e8a95ba2be02b181 -size 882049 +oid sha256:30548c9948e975b7ab9cf5602b9947f4ad21e9aeb9afc05c2ded4849f6dd4e1e +size 873909 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0f75d1a4e8..6a842f0488 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d27f31e75b555af6e2c65e103dc0595c8ec42f39f7545cc909e8ff88203d4b5f -size 1130963 +oid sha256:d9409f702967d0667418b26efaaf8af6426c5ab4d80411de6a58f85d9473eaea +size 860961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 82c1be4323..6ae1eaf472 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:743d6218903bc215723573c14097e67380684c50b2b1b8e30d7ddb8ea0281d3b -size 1047483 +oid sha256:1b50d3ed9eeada44f49cfa55bc06824d7f4d33b68d0e133445b26c8cb222cd4b +size 806341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 3cfdfa06d8..5db7cac730 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59a7f6bb28b40b0130413a743fa393a8822c32df1a928b7f7e5f5057edb1b3c2 -size 996491 +oid sha256:f15d33cccef393b75c8f613f773528560c4b404dba24478d0db44a35bdf0eff7 +size 884405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7717a7ba4e..60e74d3679 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95ecd3e1888ab361ae3cb21c2e1f144f72fc8fc5b97fabc6093b19b59e1a9bf9 -size 778035 +oid sha256:83df4bfa93bf5b66a2b31332fe945b11cef90be941dcd1b49a50acfc39dcad20 +size 770585 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index be92120154..57636636b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:abae50a216f9f45bd3c37cf600b429daa20c5cb8c8df3d1388f7dd6b6a3cdc2d -size 944837 +oid sha256:528e67598177e0539cc8335d7cc2c56834bf41ae8767d30254a9f18140afdaf7 +size 834527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4c0123f000..de6a19ee5b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e319b697518a5fa9b8f4fa811391176012f4d93b8ab20afe351355184ec75b5a -size 730969 +oid sha256:f04a05032f93ace403e16b82914990d78e956aad73a07a0e541b4eefba7b311e +size 743055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a696e69953..78dd154ed5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8f1961dbda31a5488e389c594baf907290a4bdba6ac2c5b20494d15865984ac -size 928479 +oid sha256:cc65120244fd80fcbb7b9f17ecae12a9568ace2718a877e8919b4b0fb0b17587 +size 908203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6f9e1512b9..62e1b258fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec4bf0b728f23998be4391dfda79e2ffa096ca4f222ee29e4082d9a217a529d5 -size 846387 +oid sha256:4aec14fe0eb3cc8ef4e80834e48ec917b6073ef5c1909c6aaa50128bacb9890b +size 849939 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c5f451b94f..cbc2d939b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b15bd103ec07c10ff47bd452e31c32f5e704ca13d7ebd9d6792ae84f9962076 -size 921913 +oid sha256:1dfb2555c6a5fc5afd85f1344af19c317b06979607f68a6c15652bab9b977499 +size 901883 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8832015479..cf5b1b7855 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef2fb367c007b70a4ce909a4f5a8c140954eed9ae44856592497683b58bf747c -size 869321 +oid sha256:3ae325d9c34b1c30f92bcb6a8d9d498971c9f1209039e308a6ab79c7cc69e7f1 +size 852153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index a109fee45a..62b5e05073 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99ccf51b9e48c24ee9a96f03d0ff254c98d4c3dbcc99e05917f7654c6960ce18 -size 1119025 +oid sha256:20ab089016056867d2b07ad0aaf948fa5e28d5ec509abd64d09abcf0cebe1e9f +size 841525 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4b9e4c019c..0c47046764 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:962280a52df9d5095759b852c41810ddb573cced1403b845248232efc4d3434e -size 1033177 +oid sha256:087274df5e6ad003a858532e4f3aa47943d8bc12da0a6f3172081b91cf19124b +size 785375 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 97be866c67..ab15510c1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d907dc7dc3f2a4acf9e28a4aa880bd002f769be010ad7da5be459841193af817 -size 984403 +oid sha256:299ee5f644e847d78947be629eb9216fac40f4b8af53fb4940f9b8bd0c89143a +size 864967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1f55c7a761..2e27430691 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:879e40b8a0a5113b88a40f7792efc78de810ed8dd6685cc795fc6b173e4e6709 -size 764567 +oid sha256:f2fb856eb0aab0acd26004601ffac68d1b17f168d992292e39998c051c2ff242 +size 749619 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index cc8333a4d6..69fecb5eb5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f28998db441dfd986ae271282f182cb4156fed8d6ab68ef3dc25f8f07d9ab55f -size 932997 +oid sha256:8c5f72031d7655c780fa51ff49a80820be130cd0e747b2df182dbadd72d1ff97 +size 815089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d920a92e8c..b5992d824f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:616077c0c63db194338b38edc60d3eaa0d067bae515ae2ade5f86b0d61ca882f -size 718241 +oid sha256:e161bada587d2374d9aea4c567a78dc11dd5fe3d76503dc30b5e1a952af8d67f +size 720509 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 4bce034b00..2b74628a9d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:848de0d49e9998278253f8bfc5a4478ddb889c7f72c11562bd72113172551e9a -size 1247453 +oid sha256:0ac8aeed8ed6e56f2b1e20fa0d49db935544690bdb6ee99aeb33d094a40228bf +size 883669 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 6290320d85..7d56f11310 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bbc41f014e1c385721db76d5b0f742f108b875b17f4aab0be09d05bfff9ca53b -size 1130425 +oid sha256:c68853299fc506a34e86345fb5a3fdf2ae7fb948fa00e52b350c2d6d8c051b28 +size 767727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 64b07b5732..92ff8fcd57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:efa74fd53254c8aaeac1125d0f65173da30c1e828ba30b00cc8f4c4ff0227fd6 -size 1243651 +oid sha256:4888d9c4b3b0487e9d93197d9e37b2f494c5284b5fcdf07d5e703ed1dcf710c3 +size 883913 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index d19f8273be..3f9c16518c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a55d6d9629a2506a1ea5fc713a3cfbd0aea5fa773743ba1dd6a4f9a009681b7 -size 1119767 +oid sha256:dee8f49f412c5345ad4e3d8f092760fb24bd2d45d97d01288fdbf24252566054 +size 776013 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 90d3c66021..1629ae8235 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd03620237c04bcc7169e4f943d16c162111d3bb3e5841a78c7195ca8e6f2f4c -size 1540425 +oid sha256:ab8565dbf7c3e0b1f69fad040b6069952603743d446b453dc450e974936d18e4 +size 1014829 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 250b5fb2bb..92a4db6491 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e11aca41ca40aa6bbf0a5c4de62da820093d042ba86e902360b805183289f18d -size 1422955 +oid sha256:eb11db9949c68e0bffb1f7ff2dfb1320acbc3997c0613bfad9e7ccd56fd3661f +size 903179 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 44927bf780..4573080a1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4484d1f09c799ccb4215e32f47a5a13c17b0c6fa302b1aa68475e2982ab36aa6 -size 1302707 +oid sha256:18154b8bd22e4aa41370966b2f1fc5f17731157735c597e4a7228c005600ebfb +size 887815 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1b8a81cd24..4601cf6c26 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e42623ef532ee595c772a285c7512af3b856e21fa55c9e495ceb2322083ba74 -size 1196633 +oid sha256:f4fe151f46b071eed96f5bcdb516810a395b4b7b5402a57a12de9649670f4503 +size 845133 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 5288cb3b1d..b5988e44ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3eb9b7a57771b0de6aff2dc0047cae6ace4240376b90d4d3cf5a2c4801308bfd -size 1297083 +oid sha256:178f616066ea1471dd3aa7641bc29ae126ac42d4405e6dd9020aabf83c133042 +size 877997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 0133af844f..11b2d7f567 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4652938a4d4d1a8d5f38f78b2ad69c82f6d17241339a1453badd7c38ed91ce4b -size 1189529 +oid sha256:f4ea6599a7efe27ecb6ff7149c51cad1972b43b670a2a59d00a1393c1eea7de8 +size 835119 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 226b57c7f5..0a678f871d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e83b3d660d3ee40e4cfadd48297abe885a8f7776fb53c336d4bd2ce4e97403e6 -size 1896959 +oid sha256:88189ae3ef54de6f4bd4e5e827325ff79354ed84e39bcd12b10309672d5f8b8b +size 1057997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 99de9cbc3f..f028c4aecb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7411aa5d07a022396ce1ea2c8e40e64ddf43515864951df18fd41cc76b5f28f -size 1934941 +oid sha256:0cd3137b27c0d5ed235bfe908ebfb7d2504b101ee3604f158e01a1fb45f9424e +size 1045213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 3e6ba540b5..2153ff91e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87a04a3b15d37ed5033eb09bdc87ff0a0819389c9007bc75eb3c679a2286c0a1 -size 1202671 +oid sha256:5805f3154b23cf753df89f8fba8cfd404ff1e17d243b9a8548f3a5470e95e4f7 +size 901047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 46ead9172e..42c5c3cfa2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb526a0399c1db8d1537c53aa60393079d017402e600ab865d61236e1007bca0 -size 1089197 +oid sha256:914972a559babcd738930fc40902b9ad5906afe000016ff87bf2f2fdb97ca35e +size 794579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 59939e3965..b11fe23474 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6eaac49f820dfd24132863fb5fb7a6db9b41476a11b32c444226fc4640c5b6ac -size 1889807 +oid sha256:9e7cfaa41109e1398288341633c8b28b72b97c04fb9173a8e77a49338f3c5ba8 +size 1047883 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index e7931b7b16..663dafb307 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:559bc951a40c05459bf5d0ce68c36f139ac8615066915ccad5187c1c0cf12b97 -size 1927787 +oid sha256:403b1a0693aecfee011b3ae22c99f4d1f6c31be5e7c502df00fae4134cef88dc +size 1034361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 37332b9401..a2cfc43370 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d907d3914d25c08928391e1c59fc774fde731136ca106319cb926b41d0615fa8 -size 1196209 +oid sha256:662ca854faecdbe582ed3f64d3aa149390aabcf8f427ea86359019805a7499fd +size 891329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 4bc7498162..92334b96af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81fa5fda1bb7f9079b6e244d618ac04cd06c58587351f4251328208e4b1006d0 -size 1082093 +oid sha256:a9a883133a76478de1a1f120c52a24c719ff0c958e3ee0e280494707675d5c51 +size 784515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 126172293c..c61617aaa4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5e624b2ffd76880f1ded83c5c10072fdb4e1819d8d50ea661e3c8f22417906d -size 746107 +oid sha256:e40a6d503c895d40fb34eac699569ca7970f119ac5c2948d7b30e70fc7c2975e +size 749511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ce63be1637..90d551f0b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:266f5f1b1e495c674ed5bfa04d3ada17a42100c83df30494b8cbce213f9c62e6 -size 709253 +oid sha256:73c359cd2513f77b6c626c193efc16f841f9e95c4c85f4fb816b2027ea08abfe +size 714729 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 765d6abeaa..803a613e45 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe4f44cce4220f084de0763a5009594dff1fdf7a9965699bb8cf9198da7122a3 -size 767413 +oid sha256:0542fbd1caf1668425c0d3ec623016f00838d5585d4673756f19617637740621 +size 767957 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d4c7a730cd..ce53059931 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:411987b071f9670f99badc101777fa44d9df2f2966a9af69bd4486671f678d73 -size 729079 +oid sha256:71ad1da9cd603eae79374f5523a271674481d79ecae4d8ccdee3cf7f7bbd0ff9 +size 732435 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5057e0d70f..a21afeb6f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf7f017b638219e64eb210dde5b1321f021395f2fc8d5e310917eb48165db580 -size 1369305 +oid sha256:63acfc7dcf96b8cff999c0af429c5a4dad464f0bd8816eac292ae12c6c82c627 +size 957077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 94198fce56..9be9e9a7f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f0ee4ee0dda5ea46294791c97d9ff49ba6daa814784a35fcb697c427f7c1495 -size 1184545 +oid sha256:4fc159c230baa68fb8ddc79dfe2543de4b921643a1a6043775a6718c57559217 +size 845821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 14090bcfc3..e48bed9bf4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4764719b40f8bcb3159b41af2e70214ab8350581d387fd0a5cc32b37060d6bd8 -size 816387 +oid sha256:f2ce4c9249d652d94996702ba7546ffcd65fd1bb4accd240cce7dfdf4fdc575e +size 756447 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4107e3dd9a..dd345ce60b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fd3f73ffaa3e2527a4fd8b31af2e67a12cc65a2e8560501704b653bd800439c -size 650521 +oid sha256:084d4f85ce685872bd39d48521a0f3de8070d9d71c1b280fe979fbc145d1b836 +size 653727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e1d69416bb..223bdc4501 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd3576aff321ef4136990702e3db047b728e595ef095feda3a126bb13f0ec1c2 -size 766115 +oid sha256:b40f39758a8855be96d4a620564a4f299e0bec559c40ad3ee21be6b5a4d965bd +size 716189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 28034e69c4..87c4fda70a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ee89acf885855b086f24a58bab636fcb37526baffd4b190d60996e613408090 -size 612334 +oid sha256:e638ddfc68fc0f7e8bd53bd0aeefe6dde35115cdb5dd8c0757da88a1e8d18ed1 +size 616034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bb7e6c75bd..f5e24e8e35 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:425397190b46226233af56c706b6e76b72b05ffb122effdc0eb7b6fcde534aab -size 738955 +oid sha256:bf3ebbbc6edab1794a6c0a95de5094667ff4e2cebe23eeddf7f49f0840fa9f80 +size 739399 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 15002da61b..e7b3a2b72a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:741e8c4ad9cee5505191f829c5d7747e5e86204d36f8b33ac1a9ec53a0e05330 -size 702101 +oid sha256:2223daf6c86522eaefc9d1240aa14a39a94cf85155a54585f6851a08951bc255 +size 703827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1b6c4d1878..66513aa631 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0002bded7a1fdb0c22972e70f9165627977052ec5886aa821d8ad4ad8451d3b -size 760309 +oid sha256:161d62e035ef54b833417e662c9f62dd80016acd4a90ced9cfe8d3717fbc83ea +size 757843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0147663d64..59839f146f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a95f7aeab11a59173f179ca379e21b7fe5d3fab657cb01eb00f0cb370ddc44f -size 721927 +oid sha256:f1627edf15a49c46767718a3b839aee34c5f09d3478691faaeb24f51630e034b +size 721581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index faba647180..f6b6722ec0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df311c1d14469841c5694f921e90c8fd68f28f3b26e51ea9294a8cd3656edba9 -size 1363731 +oid sha256:b6f1dd05f815fb900b4ac20eca563fdf2db3a66e2a7afe1f74ca9c6b9ae9c255 +size 947357 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index dc0626551b..7fa532d2b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c204397540a66d43596a7e4f33583d35fb8f715f73bc7c238e2247e1dbc1a7c8 -size 1177441 +oid sha256:4ddb171e0f72a2d7518dac7cf77f913fe6db12e1e6437480602b27f4576f0bdc +size 835659 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6ce8f7bae7..924958dc31 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e99b71d544913f4065a037589e6dba3e936edf5707a21c384d6231c328b8eefe -size 810813 +oid sha256:f22be376d30fd0b3ae5434c6cf464699fbd8fa46e090260f9317032df8012b9d +size 746631 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 28643d8cdc..9599560d36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd4a25e02289c9f179f932c798c5663f22952d8b408e1a0cdd3e1dfe10461e85 -size 643367 +oid sha256:248dddc27c724751c7eae0df6c80848a0d6f369e37921ffa0dddb9052b7305af +size 643663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 69f88b50a2..6cd2e915f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81810ba9bd1496da13e172761798526b5b19e1a548f7058f618a628713d6610d -size 760343 +oid sha256:38094936018707eb723a51fc80a43d9b4de34e9e658d1ff2a19579c132654fb5 +size 706421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e1b159df4c..4adfb16bee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82618c91cff73b317d62aef6d0ce427e7fe3778d3637e6afe17191e918d979f0 -size 605180 +oid sha256:eb50bf0befa1fd91b065444e69224b2dc351b60cbb032eb81272345da802b9bf +size 605970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 25dcfe72ad..871345070f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:057efb27e702e465834e5d90e34ee3168d3bd7c05007e55ce841dfd8a3577cef -size 775347 +oid sha256:5049f77d4faec23a0cc64c28fcacad952ef582a18ec447652e6f293f6aceeb17 +size 771795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2e9b99ab40..84a6d35450 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d9be4fa752fc7f364c700e0e6cecaf612e02fce758d87a343e7962a439dabc7 -size 736125 +oid sha256:adb5874ae41936812c4d95ed66262ddee529c83e6d280bde33dafdf440dfcce0 +size 735435 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e4faf3e077..1a3d871ae0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c364bb310cc545ffa38ebf5a1795d8528bd7c9648d985c83cade2c10c8896997 -size 799367 +oid sha256:f53070c878d5afbf2a53abb524f354bbe94de81313a177d660d4a14ef89d794e +size 777363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 14bf270146..d05994e984 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:436e3bb22b153ed74ce783af78e9996fb0e970bcce22cf9feffb190b6cd66ff1 -size 759405 +oid sha256:5c2f9a8acd5d02ec32748bd9210e3f8ca7ede73a70e73cb1193ed8d8c0715182 +size 738733 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1ebcf47289..8058698c7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:abb4cf1aa93ffd8b973033330685ea7a1844e064442d8e236934bab16b79b23a -size 1596519 +oid sha256:a5339ecf84986f9651d9863eeeb66a92b5993728c87158de4d5224711938db16 +size 1049315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 154f60f916..58fdf9f68d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a518f48116d56dfcc22f6b0b1c4279c1a622836b394dd2097f60a5250d4fc429 -size 1488867 +oid sha256:5feb7a80a7cae3ac6c27eb2f8f2aa17384a1c35dd6851392b32529beac895c51 +size 974665 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c6ae79724f..b93e19e538 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06ca7c9a51e5759b72f7353e2e1c4550a85855b42a163bd75d692666fbadd781 -size 841385 +oid sha256:201e2b6c01af22b09202a9c8341c54588130068f1917c822a058cbf188ed2b86 +size 753275 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f25a8fe057..3854c0a72c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:900badddf21c8ee911cace7e138e21ee03b335230735cbde4cbd888bd25bdbf4 -size 669055 +oid sha256:a1e15bddb609f3746e8fc94963fa965a941226e72bdefda7df29e57ea7a8b66b +size 661013 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 65e0b05d8b..84a47e8928 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70b92a0745cf79424dacae22de3639331414736278af51efa96f0cb3355af3e1 -size 789287 +oid sha256:4715ba4039007b1806a7d01cce605f217c342a2155dca4da173858d9d4a9927f +size 706801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ee54f05d51..1e02172774 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:845c6d2be014e2569e25a5ce200cf6f653093b02c2ff42b4d72c619ad3bc79cb -size 626923 +oid sha256:e0a014875218afff6a03347fb24948879733f760eca9d3f00a4e6d66c06abd20 +size 622383 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 644f3a9b85..b7ca51dd21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:220f4cce75427db801438f277754e4885d8d46817e4ec91d180e94fb58f7b0b6 -size 768193 +oid sha256:eb27538db67b8bd3037d01c8f27cd588ba10c6fbe97612ce7686acda4da68c75 +size 761731 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5504e4aedf..da9958b61f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ccae894ef96e72745482136b334925e3d6469662b0956e1259b14aab684f3e15 -size 728971 +oid sha256:a5ecdf39e01fcf6e522fcf2988d509edbb9e21d7a604ceff1356b3969903e07f +size 724581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a371fe17df..f032e3e1bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd0f15238c72283da5f52c9aca61a7f0eb8442a38f3352f97c8466f3fd29fdb6 -size 792213 +oid sha256:d03a355bbee12c253d8ac4a625f55d7a6087dc88309f7ecaa8b9ac526e60df30 +size 766511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ab3e1fb421..5a2d765002 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47570512b63df8e2d2284ce50970d23876716d024ac12a4e7ec0862440faa911 -size 752251 +oid sha256:be982b3d24f688b554a1926490f70108c22cf4d7d2eaf6b7736f85836200b4ee +size 728621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d9f569b439..4f6b3f89f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72a0094066920c368ea7cf00734be1d18ad22bbbe054c5e33bfb9ae7b9665c33 -size 1590057 +oid sha256:d57c899126c51e9aa863facfe176ee795a8a78344332f060d34131f237a96d18 +size 1040385 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 100a17590c..cc2e66cc83 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67765dd1a56399480e19a32b692641540d9338c1346a803c411b913562323cea -size 1481713 +oid sha256:00eceb2d820864e66cd162440e3186665386766692b6cb6fbf48001c251ca085 +size 964601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index caa5cbcfee..3c60bff01e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff033884f6e5f2a03c12cee00af0bbbee20ff1ec9a61db238f41d486f0cf40c5 -size 835021 +oid sha256:ba0cb3f96620b510b04400e0f9e47db6f153b98faffa921cfd0a00a2088f8ed8 +size 743557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1caebf10e9..87079cc716 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a62f22631afeb479f7b97158dcc5451ee1553cca9621bc6459b3aba4785386f6 -size 661901 +oid sha256:856b5e772fac8ee2c9a23caf6577511a6bea6a3f994d5f45988d4dd963e2f123 +size 650949 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index db2c9349f6..9e9dedd22a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ab1c1ae8d8df90751d4fc10a317d0a5a83de09879e40ca75df546019d47d549 -size 783613 +oid sha256:e83a64a32ec5fb0537b1f136bd3bbee4a54b3c955f49fe090add548552c9c766 +size 697083 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9a21c1a284..c11c68571f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2fd322f9578d57f896a5c18109bfbea57acef491a636400ebf79caeef6ef9b70 -size 619769 +oid sha256:1dcfb60f3e59289c11ab943519f833522e9db2b0b21e2a5c97ea67b4097e65da +size 611480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 684f13a0cc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:118a3957bf8a9809d867fd7e3de64e7568bd2aae06f62511f0cef327fa0179b2 -size 1290437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 5bae0eaecf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1caf8f7251ac74e05dbba301f032587e39a7de4b4982b511c3316dcb8d9395a6 -size 1186681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index f17c2ffa34..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d016f01f6478eca8c82205692b12a84dc482ecd4c2a651f4de9890cef32e309e -size 1286933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index f40765a851..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:89d5650fa1b6854ccb242924b4d498d24417b41c372761e28572d86e69ffa337 -size 1185199 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 62e1464c2c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:024848cf19ec6d45800a3dfe058933c8c03044abbcccfbd179eda9896907d694 -size 1436663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index b1454fbbd6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0ef7f5a70bac7f5b855c683510b545404819d4134d91b3ec050420d5e7464c9d -size 1274151 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index e31bc2de20..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:63d7a02c7d26cc91688989a6a5e8124cb9dd77dc92cf2cf53efaa5ff7eac79d0 -size 1427143 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index df20cb8609..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f724fe1647ec8512a5711606dedec6c3b2823c915a01d9082973d68228c54012 -size 1263791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index e7f275d9e8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e766577b34dea4f267842b3a8ffeced1c72d3a8055d4e62d6bf7ba98b4cfabf7 -size 1502373 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index d000431bbc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ac6f884fc2e7f286336cfb6460f04db8eacd23df5fcf07addce8cdd34da3ed4c -size 1270153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 870e6cfb32..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a0be53711715b2c5cd0dccabb4de5817433bbf9536cb3d602c17e8a048e6481a -size 1493543 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 81b6a08b05..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aef6731985d84acd699a39ed35836db1ed60a8b682d15c5d11f6d9c1e6fc6c9a -size 1259793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index e7aab04e37..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:04fbb8565b0324f179c0bbbb15eb4d4fd6d923d519b6fe0a891bb11ec5577dd7 -size 1291231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 91c449b068..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:52a051e7e43518cc35625cb36322e527d5b4139035a5ac731f8a6ac578c5a019 -size 1187475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 8aa316b36e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7e66f759ff74897fcca0672c8b8c6be270cacfa069309126f1e4b892f5dad05c -size 1287727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index ca092e83ab..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1601ac3945a9a1ac9e20aabad036cf8b685bad150e1468b078ed77c0c58a4535 -size 1185203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4c350cf751 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04768c55f017c07e3bbe73319c6975e7abf6a95d272d26be67bd33f0b0cf32fd +size 964281 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b50ba83ce5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fea92d1731e3c6a4962b490fdff6773c30731780ccc4fdcfb60e4c23b15e97f1 +size 848439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6e69724ee3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cf11838d71327c6bd89e86b99fbe0ac0dbab078ac7b29e6b8a73afec96965c2 +size 963737 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ce356f1af0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6df9bb7a73d3cbcd3b1f8546c78c7e286680955e50ed09945c5b97dc4fac6ad1 +size 855541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..98fdef4c59 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:564351687bb05f5b254dd7ff03d00cb631853afa457badcf81361e559ff80c3c +size 1090063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..818df296f8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ba5bb27e65ce48021a2acda471e17125b588856487d17d962da3c760100f7a7 +size 983101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..ddb7d7c0ce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10c3cba243c3cfcabd86ef7cbe7fd1807dbb47f3984dedcacf0d275eca60ddf5 +size 968477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..80bb7805c8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ea7cc2f109e34fbdc42f5efa6ea3798e5fc04fde867b5614805bf581b7e2105 +size 925153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6ded51aaaf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4101ca314097265867f94bb134ae8710c8618c4ab0b9cd318e8b1b1b4fee3155 +size 959399 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c46d96dc85 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c866805b175f8a18e6dd624eca8c4a0f5fe20876b46374f697f6885c1b00f7cf +size 915139 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9c483f989f..f41f3f6380 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6574e24f7544758105812ade9fc3958405015591bb6b9416756ef9535a4d95ee -size 2019801 +oid sha256:4587933150bdecf9586e87d487a6cf203ee3b1576b3995afd6407610f71f6cfc +size 1134909 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d6eb19495c..cfb1e95843 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffe397ef68c1bc4ff07fc0d0ed189a7041487b847ec083275c43f3409d755c23 -size 1997695 +oid sha256:f20146985ad2e0f323f1de6be435016aa96120664bf016fd73e51bf7403f2064 +size 1112507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index ba102748ef..ce1dcc08ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0567dfa3aa16f4f7dc84ff2db6e33cdfe620f112daed757eb8221a5673bfee1b -size 1270803 +oid sha256:b9fdf4c3a0fab790a08bbb0656ee9740ff3b054ef383daa72d75143829bd2a3c +size 960891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index f94a34a87d..4e93d29d19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8548ed199696d23b1f51f1bd18795a95a22a651c9407fd3e51049a10f32cffb -size 1153381 +oid sha256:e92a0129211c07354ee561a844bf6a90e05408ca4071d040c24640ee6ebe6360 +size 855211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 66d5084810..7853650068 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f133853be1296786ef3a907678e5c5a66cc0d62570e01b85299fdb42e7f5036 -size 2012649 +oid sha256:db049cb5bed0381b89a295275563e29d886de38f69959dbd818ea30cf399cbf2 +size 1124845 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 71837ea478..e33cf55430 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0cbcbf035b52b2393081bfce00924ade986090484bba52e4c5dc8edd074c190 -size 1990541 +oid sha256:56a8ff8a33b327be6e9842abd57cb6940db946cec8d02e6c18ad3cfcc0e57266 +size 1102443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index d19fbce479..9c5f00fae4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f0f55e54ff399e777bf99bf5d8e734bc6268d63ea622f23beb85caca39157bc -size 1265229 +oid sha256:e4315f7ef03089d3ae46a0aeea115684928fd202e3e881e5e27b0955fe2826ed +size 951173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index f1d2fb5262..9a2238630d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5752310980ab33d83a547450d8d52d6cecded43e019e12b323060d9cef95e721 -size 1146229 +oid sha256:67e6220bfad6cf69697001a7b0d091b2a8d7972f1ae7049e721c5e441c45e47c +size 845147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bcadca69b7..e6f295bc0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:270ae02732a68f0ed3e018c8b4584179b2990bfac75991dff6e205644d2a9935 -size 799389 +oid sha256:06a0e1efe46d8a3efc4ccf02b71bf06b30468116d70ff4ac5578521a72b09889 +size 791201 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e53f542e15..4a7a0a0087 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:62aa8f57258763c33e8c78e793ab797533bf91608346600cecde2232934080b8 -size 718235 +oid sha256:75a3ed64df997a582b314a598d48c85def212b50310c5f1b0e6ad906c7982614 +size 716015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bc9c51c9c4..a932801df4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3f5c344712b5e1b775038d386a354dc09528a110ea0aea842e4a8b44e7eacde -size 797607 +oid sha256:fcb340d470cfbf61c6e971923a959aa440cc7c2dab31cea0457bcdf97691b84d +size 788875 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3def04dc67..715a55e41a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0be4b553434e41464972056a0862a81345e4f52cb3d633eef799670b155ea64 -size 741169 +oid sha256:9d6e481b9c97298c48653b377bd3acbda96be48ee5ae0d75af7de4f031e19b2f +size 735347 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..7c0cc9ebb3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd0db0b3ae923cd3715c745be40818bf439e5d9a801c5d9b5793823e36ff95cd +size 1037739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..77a3715ffa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eac126515475309599a7bc48d3e4002aba9bc475a90ea1cb88a46dda2698e1d8 +size 926435 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9a64362307..8d3509afb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:423e96833262e5ed2358af67fc8171cffac97178716dc58b749edda9e33d9e22 -size 839083 +oid sha256:2426d46948567105a64f1417fcce50b8168a24210c4e0fb57978e8b2f14e3691 +size 786149 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6200d74777..fa157195eb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90b986d8b3ddf604aa99a281685fc7e1e4972473939ba03a3df29984e2c137d9 -size 667593 +oid sha256:8bed356e95336786ef440fb8ddb2404780142873cbf84f276e949759cf462b80 +size 664485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index bf597a68e5..d27d8a661f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82567f5ea17d1c64c9f1de3c6f100533861d5fa4d44cc77ffe87dc3f699836a7 -size 785899 +oid sha256:bff503ba29f02c12b9852b806300e05412f16cb00492f9840ef9732b95053633 +size 717819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 43df9761e5..2c61534078 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07c720b3c14f5cf98000f0eb5fbf3f318a228aa9816e7b7f6e387deec6dd73d7 -size 622351 +oid sha256:dd4162ed2cac7142ee68728b6c3e81ab336d2131c0ddf5288a1b56b84fd151f5 +size 619145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7a8865657a..7bd06781d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3a0372736247931c9923e417036e0687da35996dea85f4a5bb544c019096e8e -size 792237 +oid sha256:aaa1f858ca5cb99ac9548cd7f5353c6f2192722596914d15cd29f010a7fcb232 +size 781137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4c9179743c..c8268db2ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2acfadb1ba16d1c5d02f3debbbbbc3fe10888e577a339fd37ddc525ebf88a9d0 -size 710291 +oid sha256:cb6c35cf875c3096ca12d52b6d7ccc6b911bcd49a91d23f4aba042dbb8fa632e +size 705901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b31b55add6..9644ca4e9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40bc742ac64f36d8d18e95b8097f28aab889f9b4ccbcddc9290dc616a354052f -size 790503 +oid sha256:1e71151fb7607fb2f36cab528b7e2057fc01767116a4ce87ec4f0d831cb12c48 +size 779601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3af0f025ec..8bd08f06ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6652cff4569a5a249c7245d274930ea77f94c681a72bf8fdbb06fd05bc154cb7 -size 734015 +oid sha256:e950f3fb7502a96df0cebc8ddae71aabf61a58599ed4287b51a05c452b6b97ec +size 725233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2ef7015f98 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d3a2a927ce7d742fbe04b7262d7bb73c9769cd80d2f5b1188c455b0421e1158 +size 1027971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..6f910b44ac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:036d5e19bb5868e689d9c142740c2db659903ec31f9bd578562c8a62c780cbc0 +size 916271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 030c34efe2..19420356f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01fda8a2e9e39fc4bccadfa40795c8b326c7b32f652b4a67f410a8972f56ddfb -size 832769 +oid sha256:7ebdcf2bb505ae7a8865aff4e13defc772c47107aeec74c4da0fb14e3946c948 +size 776429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6ed3e078f2..4c6401dc5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32152640e57d92d416ba2ddc92d87380a42c4c80b832aec4b2f50ca5a0fbcc21 -size 660439 +oid sha256:3dbc751a5730e8ab30ffc02707d5f18ff3061847cf19253cc4fabded97467ebb +size 654421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0d22b02b4c..40e7c42f93 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ff37ce08c1e5d042aa076b88e9e7b952c874c1c32c186bf3c78b3e2dc847602 -size 780177 +oid sha256:f2bf7af54e76f983f7e4bca61308d320c6457d976e8a707e485d6315f0f26c28 +size 708003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f06486353d..65da92dd26 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12f6f740b555e3927624c3f62d502afb19d41003d28413577be1d6fd3565e7f3 -size 615246 +oid sha256:f75922ed25c2fc4845d2b904945984704f98b4f96d3c49a97cb52e19414df461 +size 609080 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7be7d3e11d..636d25ad76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d89bc1bd50f4042420969e29a5e80757c3a23a2a20f23b552a744b4e5619bd1 -size 831293 +oid sha256:e46df1d7917caf3299282989a33edb244b67e41ed876673c425c80d91f72fc4d +size 800065 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b35d78bab2..2d6abb88cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2a0127f3ca3710b912e0f5ac9b41cec34f7ff28d937d1099c1a5c6a5c8a0be0 -size 744317 +oid sha256:69f04eaa4e88d50339d626a4f81161e12f7702e8e959dcbcb1e2b871d92d8ffc +size 736719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 484001cf74..ec2f3f9d5a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4396343f5db7b7be0b40fd427d16b6fdc4539641afe50bdcc29d171e903552b -size 829511 +oid sha256:cf9021dd1b9019d9fce4ee6de5ff2b6adcf757c4106d3955f887afa3f5ed415d +size 798333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d40704f114..c9363386c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11fceba9fb6a1392899a640b009b3210de03424ef7d3cfbfbd4403784db22bb3 -size 771493 +oid sha256:3507d62c32ed35ab67b3fa788768816129c2aafd70249513e5d10ac003ea8700 +size 741597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6268e67bd1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90f7115705fe3833b189fc83962749654a4673faf9df7a7e879c91fb43f6ccd1 +size 1126375 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..33af697634 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b62e9a75293e9e39d8057b300d86c8d123ef2be663975d612ae96a8f20645c49 +size 1055475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index ede5cef569..6dbad80faf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec099a9310485f8cb8d0e34ff1c215c4d1bb306c7431b28c34035495541d17a4 -size 863291 +oid sha256:0460cefd2a0c997798eb0fc442a3195545a70eb1ee328fc7968ba319392d4e28 +size 764623 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 18905c5212..e6b67ee6fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c4a5050a1daabae9abc2460ff9e793f0b092a245b9f8f073ec7da2667ff4e2f -size 686027 +oid sha256:b17c7ebf7b820e2a88358e198428a6a95a20e838e9eff0cfcbad2e1bfad67428 +size 672757 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 806af0003c..840cb77610 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15f76a7bcfc9712e16f9f26d3f3fee192d007fba97016cd6f222f5a01ebfec77 -size 809713 +oid sha256:87687d9be59208e64a81ff050473865ba5ce4de93832052222a8aa55f0f2e794 +size 711441 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 291f32eed7..34943cc712 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a53cdf25c115f918e43c2c34cfef8ecfca9bb5056fed714116c360fe9977ffb -size 636939 +oid sha256:e566cc79938a7e1d6977f741a0e00e9f2afb0144c721bfbbf738ba7e7557b8b3 +size 626529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 11f96a6487..df9a512152 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f680254b5094fede82c02f5de207304cbc53d73400feaaeb3bb7b9f7fc47db6 -size 824189 +oid sha256:9db6be3284fdeac3f72cd10629fea03f4a82d65bbd241fe00ee5a43943d41d10 +size 789211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7f4c46f5ad..ed81650e93 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b5fdf47121418588124f1742dddba47feac9c40e3f162e0e1452fc0f45a97b8 -size 737163 +oid sha256:f8a116194329b22e78efe6f8a928f50856e92e5730e65d6cd825a13e5cc4a1b8 +size 726655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c6c1171b0d..94292b2f7a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f779a158cc1d04624562e7cd9026681559ed941e2993ad60fdfa4b2afbd90b1 -size 821569 +oid sha256:128d774dda794455ddaafe81286130c05ab352cc776dd944c9870f267c9355c0 +size 787429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0776d4de9c..0112cad436 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5319aaf7081d95f9a173ea0aa990897fca4913af34e152ea46b2dd42f36ab280 -size 763551 +oid sha256:3c270eabc57c6be4e6948b82b4079ca4deef0a9b3a68d8ef6790ae7f9d4cb5e4 +size 731533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..70a98aa157 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c89a4385f71aa4478f2ed19c4046b4e8040b8d4d9525f959c05926483a600a8 +size 1116657 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4d3ddc9776 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:370727056c93203f3f814a0bd9e7b7078494fa755523d2b74fa96823ccb5f286 +size 1045461 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7ab1c57df6..f66cc24223 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b925fc65356d2d2a4be485c0a2bb83f41d8965d20fdf2693da08b2b1500ea7d -size 856927 +oid sha256:d3c2124c903cfa61c5eff0d05cd224cd473ebad1bac5fdece32290851d380c01 +size 755695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 550c9a348f..2a6ba9f9aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50dee4d4b35bccc8e71f129426558b6f1e125aed8a1dbb098c31a67fbcf8ae93 -size 678875 +oid sha256:106b5b3fd39d95dfdff384860ad78367b07724c50d7e375313ecb496a88c41fb +size 662693 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 284ec33c77..08f46fab1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a5df41c4d24cd6ea6a66e6c537c75ddb905b707b4bab256be2a0ef26830461a -size 803941 +oid sha256:852f52fe0f8e30185180cda8190e85cd9badab25a9b5ddca4ff1ad4edaab8023 +size 701721 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cc191bac23..ca347adbea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66bab8fbb23db1a3004e4970324fa25fa814ccb8e70dad65d19d2149ff23ae7f -size 629785 +oid sha256:6cf7828e22b90656836381c2d343d092a3606636479e2997f56e1e928799fc60 +size 615626 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..bc0d73e692 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ceb21f0e0469b81ec6b2e76d6bf75b3c3c5e04144dfc8630990a65c6e9167d48 +size 756193 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ac98e1107e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9311c9bba8e86bdc9430ace0725fd63bbe1a9893b86e5008c53dc223b0e1c4c9 +size 667731 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..ce3abd8337 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da64e857f0578d3b9cf18fd6fe20f0c4d8e2c002491836c7d3d8197e1422ea45 +size 756339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..fc935970cc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d37f1ae66bdeee4f720fad7ed1f77ae5f17fb82bd72cf6b98ed40d59259b6cd8 +size 671527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..df4102ea2f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2278b875664a5e99c5342d0c053aa7b0d5518fdd624b834e06d1e3e1e12b4ba +size 824355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..293e83fc2c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e548320597355c96b0dc597ce49167319f85af572fe6d680fbc139b4f209f074 +size 734411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e24d3b90aa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe2979425b008fb27002ce6027616b79c7b90358f1f8c689dc5a0921d9f1c933 +size 792257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0127b0e655 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8870bc43f489a9232a32c062f6e6cdb68248d5e3c5d79e5d1e34baa501b3720 +size 754559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5e5586f7b1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f8fae83b68b13b6a67115988054090fc54c6dc86835ffbbb26c1d1f8b0e13c1 +size 773609 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..651a734a20 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5668a01447e912f67b236c1ab5390f86e9364025cebfcba7f37f7dffcd26e5e9 +size 733593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 758d1bde88..916395de36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d102682904aad3dca38486e293bba74e85e7a579205f44f0f252147bb88c34da -size 907577 +oid sha256:cd4b408d095be17ad5cd2e99f3c1d66ddcc77ec9a4c903c4af9719fd004234dd +size 920405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4699533d08..e23223832a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f982bda7f4db55510a82a90bd52d00996d63485b360e7c41b6590109e256e6b -size 831899 +oid sha256:e6b01400b6516f52e401b9cb12c414f087d92f7b51869c63ab56b4bc41b862dd +size 845663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 34361fe1e8..b98a3a883f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7c458ef2210e0475f90051b06237f3490f3eabf83beb4c7807c9de066341248 -size 901799 +oid sha256:1fbbca347912d029e2a448eb3b51330f4771d0d29a6189c23f6e62b3a1506080 +size 914873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9be0b4c493..6d590da972 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d368aaf3592f673b349e5e482e273288abd06505bd866b1b583640e3ff6f6a8 -size 851625 +oid sha256:01c5a4a8faecde79dc8870f63764d8076b39c04cddf55bbced699d84ac96fe52 +size 867511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..9120372c79 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b15ec4f3bb867e52c048c75415c580f4c6b4be6e58ecc3f3a77c7de02cd9034 +size 862999 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..65f44cc784 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:822b177533937b40718b6209aee11704c75d0e22174741b65dbe8112267e6a5d +size 759983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 875dda7f7a..0af6d483ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e56672649100dc3b4e5bb953f2cf84439b8d77da8f0ffd9be35d8ad0e4e8dbb -size 986145 +oid sha256:b01dc894abc37ca7fb18095e1e7f7e5793b8cbb6786e70becff0e26aafbc2c2b +size 916537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ede9a7f6ba..c4e1381f7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73d877f3d3c7e3aefd9c15e232b95947668d4915b1d05560b11d77b4924849a2 -size 774005 +oid sha256:f34aa1be0aa255f98a86a83edff902542cb6ac491353f7d111393ae228b90119 +size 799953 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ab4e0c0d12..7058ccc60b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e514eb4679a090941312b07e6870c6149ed665cd5eef7d3d77af3df4715d04f2 -size 922455 +oid sha256:5bbd79f20022119246da0d880b973429dd951ecd3144f703ec2266957bfa2200 +size 834099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1b0f5c0cbf..f9a3267015 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c99cef9160e1069e766fa90c8c30f5a880d8f5984b979a071590f1c42866ca4b -size 725607 +oid sha256:a3395fecf0857af63d29954de689ea54bd4a3b6f8d30ed7a92aef613be3d7c74 +size 737397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d354c22d10..90235f77dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9752284c590da5e58fb41da994c0155900a2b8ac9b4b572a7bdda1cce7b31314 -size 894061 +oid sha256:ddb6bdb4aa2f78dbcc0b714f5977afc3576f5d8bfb3176d797af0385be6e2f62 +size 898649 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cee85059f5..bab8192422 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f58d65b106f5308b3d56c95673a1876dddca64c9146205f106afa56b5d94179a -size 819171 +oid sha256:175c3f9b587b041263ae02a468ab1e31e9f27b0a487b125af53efa2c29dfa979 +size 823955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ebfcf56cfe..ccd33d461a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aaa7e919c80fd8b05f09a61fd4fe78c3b67144fd5a13ebc59c92923724b693fd -size 889121 +oid sha256:39d2741e008be1ad6ee166cf4518279a1587e03c5e232fdc57a7297c695b6b13 +size 893117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5915e91ec0..e0a3b50a94 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0726fc1d7d583df4fadb174f5a38f80541eaf35806a88b342e7a377b777f99dc -size 838109 +oid sha256:ec985ee318ad68b231bea3ccd6836dd8f69a1c81ef75891664da1f39a7da4fe5 +size 845805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d2edb1e869 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eeb880180a0d034fcf87dcf535a8c49f39397467f8fed2c5ec1c242a8fcc389 +size 844351 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..716a44e8a0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17f382f57f97034f1adffbddbcac2ca858c38ac005d4850b4b8497e1acfbc012 +size 739017 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 91fdbec835..fc321ea4aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26834fdb8ba887cca91a7050b2effa885d86fe7984f9e8e5110d78236f4f1a5d -size 974059 +oid sha256:c17fad7d4cdbc8604c273fbe3d1aaca51a1a29be1a531c13a611f7dd1f730da5 +size 897889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 792c332497..049ae28531 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1bce0aa133e99d4ce8f505edac5da765a8251380708940682c1963054ef7c03 -size 762953 +oid sha256:22773d10e3765fb7e67c1fd43a0b7ec9c5aed3f3f49098ec65e65487c341c228 +size 778197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0e311ca710..82bfe5aee3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:396c7c9320d139e06d9f4dcf100118e65ee5b22d9a8954f63a8b25dccf17cbae -size 910615 +oid sha256:bfd815480107724603ed6b60385f49c3bb8435032490e825419754394e917c04 +size 815303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0e938cc259..81dac45990 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ca99285a07f13df46176bcbe95efc8063b5c9441f18008f48b3608b3003b8d3 -size 712089 +oid sha256:ff8d5dab44f939c340823c737857df8a0e74b31a3c0063bc857506c860127eef +size 715691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ff11b5ac5a..6c6d6bc267 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c288b5036887b28f293b068894d515d9c61daab7501e3410e65deb8749a39b7f -size 941849 +oid sha256:7f820c4446c0ae280dd69b0afa54ad5893f24b25dddeffb24e308d08cd4d2f8f +size 929861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4ac19608a6..d9157fb399 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:145fc9879c87446acf267c6569d90d54dab339dccc7d95880b9bc90c12276866 -size 859805 +oid sha256:f5f99a768675ee9d1f1f1ae243e4d3d83959692781e97739941317bc8a824e5a +size 872385 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f98e3bbcd6..4821db4c1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:103c764e85ebc0de2cfb52d3b47c060c5ca69fb773913685485ca0bae8e29e21 -size 935331 +oid sha256:db2ab258850ee60c82dd5e7055fa6925fbf6d99eb2d19735b88ee01eb52a0bc9 +size 923541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2121d510a9..023f0717b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cedf0b2d294fdf69e840cec5729ab5c13075a45261a4d224e6e48a002907903d -size 882739 +oid sha256:babd219bf4c175bec2c1c759f7d747ad42db1f0ae79fcecea8c9b1c4673be08a +size 874599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d87b1b32a9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efb7a9199eefee026d209f1fa8549db77b5b8a169b7a5ef0ea534ab3ab91d900 +size 877291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..33dd9190c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca964e4bf54346dea39d2f8a9e9963b92eb7a5fe5e2dcdf25e357176f72980d1 +size 822671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2a0d99fcb8..2ea5f22614 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b1aa88a67906110b7a02ef39307212875f426f1b48aa8f88543eaa301cb5aef -size 1008675 +oid sha256:e8562289c58b23f29eb0cc63b31e0d10188064d9f17fb4c5abfb3b923f77911e +size 895801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e41ea49382..571f87f7de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfd752f2cdfa742359811070caea5d19193563ac69a27e054f359b16b75dc36f -size 788641 +oid sha256:59c5fd0548326c9a0b4514c62ae2e7a9335734084b7d1cde1d05a16a4b823c08 +size 797077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 788bfca76a..dc5894002c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3f1bd5bfb28664cfe457cd0f87dac330c72830c1317fea0f41ad923b98db4c0 -size 946217 +oid sha256:5a7228ba7098e0f055c4125ade56975e942c0519f825e509edcae82a8151403f +size 835169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0c584c31eb..8d406e4666 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ddf60b8dfae3f60c4ca5d5a9e5cba990204a2e590a3fbec1ce3bc3561b6d1fed -size 733139 +oid sha256:4fa4249216af5425e7b2eae4abdb537513eee4fb9186a2f5a7d0cfb44cb925da +size 745225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c7edb52978..08e1d7a52e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c61a788ab8f4791dd6151169b5f338383ec978fb40edc9577b10013190aa32f5 -size 929171 +oid sha256:c5f76ba8ce9b112b28fc791d2bb087c141e953b92894c13ec1980f72a15a7a85 +size 908105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2e0719a362..5945772d82 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cac01f1a7d364805ad903c9bf840bfe0c6c7e99ea1a3f57830fbfe08172c1ae5 -size 847077 +oid sha256:912b1204b7a6b7ead1a1cb7230504ed959528fbfbefcd71935c27a3d06f27ad5 +size 849841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 94c69856f8..1b8feb2bf9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f042bc410bcc1fb6e2f2a6e96863260187f77930d94161ee34b25ac1d22af3e -size 922603 +oid sha256:62a9b0169f415bd7cc6bebc460ec61c24b36b8dda08c006b79fa772453d3d54c +size 901785 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e629d7579e..86011b18ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab87192d4e69cdde34fce4a146a47677f4aa3d7703739c766191b5e3b1f152e6 -size 869223 +oid sha256:d22b12e58cef1f7cc6d9bdf4a2b9aa9f27d72b369e4e9ad3f362b6922f4d1021 +size 852055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..9acfdc3759 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:830533e08bab3a5edb0ed175c90cbeba2bc805a2e6cc3c26a6d1f784c1e8b8c3 +size 857853 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..72a26a7127 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01d84b6d71c8452cd428ead139394ea7c0fefe9aac000c0fc996b22b8f4627df +size 801705 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 34db31c9e0..f6fd7ff9c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9495b72b30a0282df87d1702df6e84b080c5881539c9fb4309e5ad2d62511c58 -size 996539 +oid sha256:727bba6cef2e0d095265c949125d24ec58c107b135b2d8d7d44af35dd816a7fe +size 877103 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e3131972c4..b3d4543f27 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ca1678267e07f43b2c5241e0fc4bafd466f02fb31728047d9e1bd025e2f60ba -size 775963 +oid sha256:be9d351adfc1f012cdd0b7478d1a4125ad52f5cc42a799c2f0eb5b4b68fab04b +size 774531 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0d61448920..0b2701bef1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43ca822eaa872bb41ea67d285fb62c951289eb6d27f25d424af53969b5a34157 -size 934427 +oid sha256:ce6cc63590e32b29e387e6aabda1422b9293d9c6f6306b5fa9e0db7e5080f48e +size 816471 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ce254a4c06..971566c393 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d58d95779508d8b4baf004a84193b08e93960541d173274cee4dd633ea66ac98 -size 720461 +oid sha256:cdba08a3483fd87187404dc8bf4698c4440e66b03dedd683ed35aa3a059d7196 +size 723519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..04e88fa6e5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d759997c2cf245ab80d4a450eb1e9d8d095865e602bb15f967f7378e0e7f9c77 +size 887861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..06a9184926 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:798144e66e99bdabac6b4a85019e3cfad0ee281382677cf28cb9633121d0dfc3 +size 773105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6f1d3441b1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1fca01f31e249c7828af2af9acd1f19a017925ff6da326487eaf3bd7cf005f5 +size 888107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7d5ebbfd77 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b472182b04f4ce2dd5f96430446ffc678aef7eed4e75d6092ed26169d20c1995 +size 780601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..701d9fe268 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a81b3947b8fbd852d7dbd936a9de4b4d4d5301f21905e0223b96c368a8ca7f84 +size 1019367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..43a292909a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3477dc57a52c438364e3007bf3328a5281025e86e5c4f77aedbb88541a90c58 +size 908557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..50ba25247c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21bfe5b73c299d1234232254c7c9ff0ab13b33cfb5da84135ebb8f29d831940 +size 892797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..78069ba273 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90b35529b5dc321ebc2721440bfb29ca9da3ad44e523fa0b552dc2603f54dc2a +size 850511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..974c5bcb69 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97c0b7b42f033936b0fda7927a0d42f8c9cc6d29a1ae49dd10a9144696ca51b8 +size 882979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2c78a20548 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e553155594db324fef4dc41322508e9aee913125c4b971cfbae447935b26498e +size 840495 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d3612ffedf..0c9d8fd8ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b0d0143670d77d3d81dbff7ca21404cf63e73ac8a3724277796bf01844c67d6 -size 1897651 +oid sha256:345fb6b11a5f7e60010aeb48205b8af1122d85fad21eccf80064a6e9ff550ba1 +size 1058687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 4006eb594f..dcb1a022da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1abcec700763de982702b459b3b9ddc3ed345876ad4295e95f2559085d6a795a -size 1934841 +oid sha256:5f3514852239baf20b5b96ab82a7c1336b92a47e698a469aae8d13674f08f02b +size 1045905 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index c227a6cbfe..7143641b40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:150150564a262b1669feef3c35463de18d97407229cf7978c84e976b26676df4 -size 1208641 +oid sha256:f6915bda11b6e64bc7fa72539d4d88e711c956adb4e370ed12c5d7b1e4fbc0a5 +size 907559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 80b8cd6cdc..64f86ab292 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:326976bccfa2d9df5723af250412f4f2ed9f57bf2ca10dcc189d85b4021703ce -size 1094427 +oid sha256:2e9f4f38a891e34a8b4a9143b8c89d3028117f73877d29c2c79683c2e7e36b3b +size 800745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 26cc1ef415..1763098571 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c38a84e2a55c66068932bd33e5eb10344d7fe9ecc033d1b02335fc306546460 -size 1890497 +oid sha256:f501efc717e26b5461e70543e38ab06119374a225c5c97ac924dc78dc5a33356 +size 1047785 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8341e4e6a0..de3e3b09c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7ad5997d42ff6b1001d4773c66ca547e145e14f41ce47aea6035aabf381fc0b -size 1927689 +oid sha256:88c3fb2b36dc31c921e82efebdef9536e4047d3a217b3118c8fc822746f40a17 +size 1035051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index c1c1e94831..158a8ca45e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb66ea76f94b116ec39935f18bf67f445f670512a00858e915e0830ceb5e22f3 -size 1202967 +oid sha256:3e79a40c995336a52fd27652c2184a53b77febfd0e17acfa2fbb8b74c1b19eea +size 897841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 3fefe290cf..3846a26cab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3d3880809f1ef22e18950817031a1955f8a457b47917bdfed1a52d72684243e -size 1087273 +oid sha256:7dc095d82209708e9a90f711a91066cda4b659d58f9074ec08d3fed2684c812d +size 790681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bd0fd53218..e023dc5cc0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ead1c57f05812057cbec4d620eadf5bbd00b76fcef910a8014e1984b7896ff3 -size 746009 +oid sha256:b11eaa59f0f66ca6b1c6f30c4678cf327ee785f9f7f4e9ef05b973955b808acb +size 750203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 023318677a..ddf9174f73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8468f72d822b696ad94aa97135b60f78a7c5c81c38a2e776d9e9563d4c608162 -size 709945 +oid sha256:2688adf5425e02544ac57b35335a919e001ae651f73e109a6480c85885b5f6f7 +size 714631 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 30bf7d1d09..b32eb12348 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c4d5f4ac8d19a04aa4e116e3d25a0877b49daa2d2e579bae03b37575016bdb5 -size 767315 +oid sha256:060f0bccba378051ebd66cc8a0eb9ab50d378bf28d8739ee8ccc4ca7dde15ece +size 768647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2821c27444..e26a60677d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fff7876481cec5d9536939600ddb289ce033d1ed93ee58154d0227df39e9514 -size 728981 +oid sha256:29975a8c95110bc04600f7dc803dc3b6f1aea9d24b38e6fe451ac4b6d95aadea +size 732335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..dd35ab2492 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:922d3db8a3d16498a4559dfe087f4fb88cbaf9d395fde67d254bc3799082746d +size 961269 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..91fe130874 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e1e9e1dc9d5b2bcb8801e83db2308211d2ee27dd97ee951187c45f332096e9b +size 851199 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index fc50d2b3c1..b438a71087 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09521725c91ff06df7bdebd7083a32fe859256f31481a73c2ba0e5d23bd396a3 -size 817473 +oid sha256:4fbc305f6562a5ebd55b5996810f86cdfc79888b8948f46f90971aed1eb98d0c +size 757583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f62b2394db..f91c92fc09 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:246af3d650d204fdd6a9ba56d572bb95c17009ca014b9fb3bb875f24ef63b14a -size 650817 +oid sha256:7a5a9f354119c0456269a1ce0be1fb53f9ab90c57f6b9f5dc2f84a669ed68422 +size 654763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index aea54a56b1..63677b69b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37e355e20586877e5328963d9af47571683bcabb9cc407af22d26adffd581a7d -size 767053 +oid sha256:e5b164c710ba33f2281f151b91d503eca2f546335d865ca8f97d9f904309152d +size 717127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f5e5b3dfdd..3bc12b21e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a4303bb0f48c4dc15ee13ad2f4519468ea9eb1e8f379c1af8c3e80c292fee63 -size 614060 +oid sha256:177e6d39b6bfdc9571c514ab91225e6b98006691cb40031c06d79f4ff587c3b0 +size 617761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7f52b66810..1eee447f16 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37867bde87f5f55df7d7586387ab7cd1dbefb428cff23fbf1833bca8c4c04aca -size 738855 +oid sha256:c9c81229f8aeae65c247cda8161c60ea6352bcc50a4fd448386fac24ce4b968a +size 739299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 254792ac67..ae9447e771 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d8c63f1ff563c94759b4e283fd1c2e7030efb31b20e30acc4465b584cf3b90f -size 702791 +oid sha256:9cc60b71a5e8665877484d196f2f109e219287039101c5f60338d10d52e616b6 +size 704517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b550e21bd5..e55b5e6ff9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a23e70e1307cfc4be494e7e862c27fcaddb12d8e758008fd5c999c22ca8567a6 -size 760211 +oid sha256:12b6f5ec5fb365945043297f67ad130deff897d0151455b66bf5802f3077e5ec +size 757745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 88902d68e8..b586823400 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63a70e071a8ed7a827a9e923357480b5a184510054354c4bc9964916522bbbe2 -size 721827 +oid sha256:a16038038e3dc61c50e8fb1c8348c21a24ea3123379b6d6f0497bf8f87c997f0 +size 722271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f2cfce29ed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90e3bff1ead9cc293c9836d1afe1b91b67996397bf15c3b550bdbd33c5f3a8fa +size 952341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..95a247e6b8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc2ae1c8aebd873699f055925f8e59fad1d846d66d1e3e677eb16e350dba3ee3 +size 841037 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7caba370ff..7343608548 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b5d88004e433d22acc66004232be6a74494849ebe055b448e7733c9b7252ccb -size 811947 +oid sha256:0ead5bf13be725ffce26639c852d036178b11bf21e95fefc468fc1d42f206969 +size 747715 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 69972b2804..2db0beb125 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3864fa260fb70c5a355a73fec0040a1f2f87749b8fffdf17323d8edae4a084c7 -size 643713 +oid sha256:77dd404a5532ddc94c84ff60f48ae9a46d979a3e0887f8fdf65ed9015472fa0f +size 644651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 39eba29b41..589168e569 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f392d46f0a5fedea228e99597b50435e999ea84636b7ba896bfd89702c4fa45 -size 762119 +oid sha256:a80d64fe5ab290a74604b4aa45222f6fb3dec13a14582251de75b6013d448d05 +size 707359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5d1634639a..e77f506ee2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09f28ad4add3eb62b2f4a983fa41331b89b7326d183fd89b087e503aa19a53b1 -size 606956 +oid sha256:5efa199d13cdc1208034a386aa3551ed38caa6c9df7881359b888b437504557f +size 606908 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 75be37041d..4b1eb7d274 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf24cf498de806c4adc14721505b048d72fca3f7595897ebebd740e5abf96b02 -size 776037 +oid sha256:58b66f2f9d48cee342281fe94b4f1213c8fd8ce6c476755e77efa425868ee922 +size 772485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6497429465..77d9d1414d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:514da8058607e94493c7982e3a950ba1af35e377cdd32415006bebc1549061e7 -size 736815 +oid sha256:416584615e9ec673ffb442b2a9d90be48a59d90930c2b5e58ae0c0a0c7e78e58 +size 735335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6d05e7e3cf..aafda64d15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3fd26e9e8190fecd35555dd62fefbff7ceed5a575ae800f5a94c3219286a1d0 -size 799267 +oid sha256:ad1945ae4e5a6fa7976e0148ac60748e73877c0da23ce0ab2037a0cc81dd652c +size 777315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 51b13da284..d830cc109b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12d34dd4668d695223078100cbf903e51f672f155e32f7cf76b6c0aa5396cabc -size 760095 +oid sha256:65beb9fb5b9d0192aee03a8f2716f1a97408939d7f543beb315c2350491ca1c9 +size 739425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2c96203433 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96a32dd1a4627f85f069b1319233160dd0388ccadc41c7ef0403202475341839 +size 1054691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..3f8e2ef758 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e85a3e6459d5a455fa0080be1ee96763ef1dd006f8dc4fcf387c7db5154aee64 +size 980043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4020881e4e..b85327f275 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6779633385395f3982dd0245e3b9432883f31965ab5c27090fb824a22a8c558 -size 842469 +oid sha256:dec26e1dc53290fc9bf87aee73003ae3b4c51fed52d33b96f072985191b0626a +size 753621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 34639b805a..63f7c707bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b840d3cb016fa4cf67965e2593ec862c2cc7186517e2497f543b6c2bbecae2a -size 670141 +oid sha256:032e993435c1fbfc1305f146c2b5bb0128fb27b999b11ff6bcc543521cecbb80 +size 663135 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3e455292b6..3d672f2dce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b7df36fe97bf7cb719a6e658fa1d77f2c8fa8c4e57487f53dede2c6d25dd3a4 -size 790223 +oid sha256:86aef0211acfb06c1e01d6e3ffc4b7099e5cd0b822632b69322343788f20e80e +size 707787 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d825b912f0..cb0154ecfa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e6946d2e7ae96a5a7a469a3cc6394925b58f00c1453907fc35e0689c9a85176 -size 628649 +oid sha256:edf8c0d34b32383bbe9a0903717cbf73e3db680003d5e7a0d08b186c6878fbe7 +size 624357 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ca8b1ae49a..9e355d7aca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:559e2277e1aae08a31d39cde366da63457ad36d0bc1791bb2556a979f906401f -size 768933 +oid sha256:97d5179456bc56a94aa8f63b9bf92acb6c35086bc15955fd2d39ee65b5d117bd +size 761633 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2f4099d7cc..dc8b9cf16c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f902b96bd1a97ff0a1ce0a39145de24a6ddb7f1084886032978f663f08bbd5f4 -size 728873 +oid sha256:837855f3248e1471fb1749c4ae1d77096edff28eb92d948bee7bfec426c68904 +size 725271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b4280c3a1c..e1c40c30c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69839d1c33d449c0d44498be198287be1136df57cc4d0ef045e4025c5333545d -size 792115 +oid sha256:a4367b0818f0c77d172d085164e68bd5d0ee05fd4bb7378163fa0534ee2e0375 +size 767201 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4f827e6e46..fa86bf5b66 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13f3e114a9d711160c1d682b6dae1cb3bd0d0adb7381579bf763311176881574 -size 752941 +oid sha256:4dcf98d070490a46660e4892db2da35e8605435e562dc632bc98b9b54ed31035 +size 728521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..adab180697 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cce7f492e2c67dc80f8b7e842d74692c1b479e63d44d70f5b48d3353cd3d2f8e +size 1044973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..462df29f59 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5071765f45a61aa056124e0364b991f5d80fad380dcca02ad1341383498d35b0 +size 969979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8ec774ad14..d12b66d96f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85f83bb4a2987a2071f45bce55a8264fb0df0f17eaba3ec55f933d5fd306062b -size 836155 +oid sha256:ea725a226124aa31315310b51ff0a1ec687c2a8a21484a86c0fbd237f04c77ca +size 743901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b20a34ac44..94ddd22d07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76ece806a4266bad280a12828e5f7a43553f4f0809d3557149c5b5b18e865bf1 -size 663037 +oid sha256:400d7a50681ad3f51ef7fcb334bf491b3c074065001a3bb02995039ced7d4f67 +size 652233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0477a73031..aab3dd3150 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41c16974c04d13c52d9b24bc0c162c17a86c6371f6fa1971866e668eccd5ca8b -size 784551 +oid sha256:86bfd6fa3e0bab60d7fd045ed2ea91865b1c7bf3e0e14ee050536f9e72135997 +size 698019 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0dda53b9d7..f977771136 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e22253c32bd47caef7fbb66f016a9b6c9837fcc90202bd674b2ca86cfb3a3917 -size 621495 +oid sha256:4829b28b0f00d9d0337fea6469ac9e741ca331f6d9b55f2f92778d1a988d3c6a +size 614242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index b6978af676..50a76f7e76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94891d9b1a6e82ef0122ee986d800aa1422cf7b36a710991d22cba7baac4ef4d -size 1361661 +oid sha256:51cf6aae6aeb5da9dffa3892df867f386c493ad575b7ab057235bef1fba2c6e0 +size 996791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 0025cdcde7..e46c31d766 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f04c654a327a65c266c3d207622f70c42f30292962559de72b95addead41067 -size 1234521 +oid sha256:2b6d7745a22012c5145c9de123d7b39ab4a4ad0b1c6cfea617d92f5243db1910 +size 900485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index fea47f6922..b8e35b9ddb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8832493475604ae90e095ddd030ba275dc386d10ff65c8ad10ef576d3916b09e -size 1357465 +oid sha256:c9c749198d9f44912e24e8f64e606f1c0fcfbcd72066682b1358065466f69936 +size 996937 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 45dd7cab70..3753d2f5d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f969ab5783f3ba53a53b6af02cf7b1c5d845ded8dcc6c2601e754e3939721121 -size 1266833 +oid sha256:88f6c47f60f735fb887a0877a424f3cab21ce7a33eca574b6cee95522310adc8 +size 908425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index b4c1eae129..6c48ed83bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74196cf3f5a8c96c91b4b2c856f9a19e33c9164db199ccf353e18f9bc50d623e -size 1642695 +oid sha256:d7c4dd067bf301a5b75c2fd3e7bf53398085263f1d466394de3289250b67c068 +size 1122623 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 7c6da0392b..30ac36b0e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d62cf3ca6accd40dc4f3f5d2a0c0d7dc28a97dc44237fb351538627ab1e4cc67 -size 1515111 +oid sha256:f7bd2d50fb01e685473488dc42028721e5d17f0d676c8162958133209ff9a11f +size 1036035 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5b2b00c25d..63eb5bf245 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21334b3c69571e363a90063f99b20d2dc8ca78e6d4b9e3f465df8ebc878017d1 -size 1478139 +oid sha256:f8c6410ae409eccee2a22230f31489328f38825506ad0f42aad26054ef8c73b2 +size 1048693 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 355d657193..4a9f64948e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6493baa642fef2754d820a25efe027baa793eaedb9dab9c678bfabddb0e017b -size 1302751 +oid sha256:b046bbeaa28bcf072814a9b733485b25a75844f428f569b4ecbdbb09729b8fb5 +size 1006801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0014fc9822..2de6088eaf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08a78b13a7b6c1b2b5fbf3ffc161ac39a0fdd697635aa4c68b8810a415fb4583 -size 1463833 +oid sha256:66c0ef8fd27e9bb2af011a60431ff86a4b13f3261909593fcc3562b1eee93c84 +size 1029205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 04e0b780f9..4a0327c372 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80530e3e6921fa8068830305a97ee2484ed2b25f3aa36e68437308b36d1edcca -size 1289283 +oid sha256:9a5d4b6dc8051fbd6c81e97046c760fdf610fad81dd0999c63048d25a8c9965a +size 986425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index f5650b5a3e..5ba2b6b248 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a877030b4a37f1b30ef7fbb4d6ba048ff5ad78659fafe2fbe41eff96c9ee25cd -size 2126313 +oid sha256:08cca59145a6b119daf8c17894a7498599dca35d1c86ff0169e6b4606fa55af8 +size 1191593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index cf4a8821ee..c1cac15d2c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c28e2f69661ac313f08567e5c0a99b74e92c983dcd7fcfa8c91af99ebf2aa6a -size 2111901 +oid sha256:d30389649dfdb0b9a796b179b1146a28b6a185ce8fbf34d4e8bdcf8b6a7da490 +size 1176443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 68cfb8f2e4..f55a17984a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4bae95f08e3a472da1918df4f779214dda6c4e81de5a69c5853f4197405bf75 -size 1367989 +oid sha256:e26e793402a4b8ca0a3b5f043f08eecfa01045866702c94fed45fb4fe619aae5 +size 1037061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2446ebc9b3..8768f574d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c5198b33c94e43fc8af86046de309ba755494c0b0f7a765c62249fe3f25113b -size 1281353 +oid sha256:40e8a905c354d00067e98a54202546662b4a441a85ee3714a86d697619be4ce4 +size 928619 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 79130e6656..66d72aa85a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f046b2ebed19ebcf02b44422a70451ff2dacf851f8e2d3166db8de445882f43d -size 2112795 +oid sha256:82c538a9200ab987ab51079a376fff384f8a974488c6adeb0ffde20cefaf383c +size 1170429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index e49b9a5b51..29501c6792 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53dd1b89578ac9e0628de6aafa2f07b0075e492d6f5bc8113b613508d8464556 -size 2098383 +oid sha256:9dc56db870557c4ecf7230f900a0b11e587846fac3ed52bdf3d23d02007400ef +size 1155279 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 6033b3bee9..46fc4cf808 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f936937451fe168c8497d3e19e047518ca0f79ee245b117b22755f4103101a7 -size 1353485 +oid sha256:45ae370b1312e597dc74ff8726def15a959683aa1dcda954540645019a4e1898 +size 1016391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 20bf5f33fb..121e01309f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a44eb47753dce5c1b3c81d23cdf4bdb6f0b6ebabd7213013cb4eae2d1bfb8c3 -size 1267885 +oid sha256:1e362eef9d4d1d50d18470d72670383d4fa33eab82803f4c94cbf18e8ab0db4f +size 907357 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0c87cf8ec2..aa13121cc8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1b9584714758d4a2d181c06067b51ed6930a30dbc14bbe759829a23f866fce4 -size 833479 +oid sha256:6a0520abba65dd33a4e5d298ab087bd4b66b525a0a0d512507c95f5393658e9e +size 845713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 62c846f1bb..72401b6ca4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d015195cd453c06874ea22047e6bec95d1e66489bf14261a38d0d1e7273ccca -size 751287 +oid sha256:4f9438a22627348551a559542733eb3b8a9eccdbe4c8622f8f68e862cbd2132c +size 764065 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bd2ce6310c..999eee0183 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b6908cc22aa4b84f8da5b0b11ee7786095c38b695d0e6a8d1395cd8978a0317 -size 831007 +oid sha256:290bf467cc0d54ad636ec3a13e4a3f5d1a9450e8a079c2d729df49038a5538a9 +size 844227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ed84bd274d..bccceb0b25 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7a1e85296cad91be871488584cbc5f946517212ae8e02cb5ff6eb0056b0de4b -size 768697 +oid sha256:888225b72737460042905c8dcb6eac3a5d8b580b47494c0a0402fbd4fb8be212 +size 784483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b22ac17ff8..6505974390 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:118f4928bdd22a0d8c37c06d5b99ad48eeef9f39f5cff19c4dc2f4677ece2a5a -size 1543799 +oid sha256:3f4bc642cc0cbeaa08322f2fc9e92b55668305f96fb2ce8de9a940713bc64eb4 +size 1123973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index d883b82a6e..09d678b977 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ee96fede4c26902662ae596edb2352db31f21127d5fef577175776a4fa5c934 -size 1334175 +oid sha256:7dea842b218aaa4707a10d5b7e29e793a73d713ce60624fec771a08ab2820d34 +size 1006059 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a3c6680af7..5d914d93ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:adedbf675610958a49d8719a68b556af4323c20a327e9241f6f6981d2c964b07 -size 921469 +oid sha256:1e8530740b268a511ae975fea699861477ec9643d82e3f0e0ab69a6921c11993 +size 847321 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index aea0f1687c..debbad14e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48b9ae0fbc63f1c3433cc4278166cf47728bc3c2a9907159636caf39c01f6e52 -size 707651 +oid sha256:c9277049c0dcd4483f833d94ea9eb1febb56e51b2b7fe32e4584f6edf16a4cbe +size 718603 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d1c0d3c149..4226e6e7ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:370d98c31451dd409952f975807ae766ba9794ac632f975d93f18f916a3201e5 -size 846679 +oid sha256:0d0a8ba0425916c9d6ccefbbadd88315c471ec74db1c60622cd63c49ac461275 +size 762861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b4969f6af4..5e98d4db8f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c51163164647607316760fb1707f9a8c88370985332f17f58deda1d8bfb65b59 -size 655059 +oid sha256:9de625c5eaf3d0cb051fcac0cb6f1aeee103e3fdfb77233f404525ba63ec807f +size 666949 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f35bd4bd02..d9c45c18d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e01c9f64298df43db745f5c1d88955c9042d5d22964729f4a3c88e9f0b639dfa -size 819961 +oid sha256:2f01f92aa0f56491466e704f8e44024c0416d21466151375df6aa1314ca5c604 +size 822675 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7328e0ea67..39f3db25c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95b1dcffe2bab60301c80958801c4564a806ac2b1e97cdfc514b756ce599d962 -size 737819 +oid sha256:240c24c0155244c652b4b97e028dea6017adc4e44fc48b69425526647deac74f +size 742901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1de2df2fda..d15aa4087e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9f56a8bddab51461fe40cbe3c599cf840688537aed0e279bb95bbb5f77c391d -size 816699 +oid sha256:9dc1cec282de4b3117422953843ff5ea1199c5efa81fa75d2f9b203578197476 +size 821189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 356fac91b7..bf186345f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d85d2679a81a70ddba84ae1cd83f5cb1c7aab4d054f4f8b5faf4f411cdb9e5f -size 755229 +oid sha256:c25f5543844668550488f2602ea5097835553134ec6d0e9f64c746804ee169d7 +size 762283 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 7e29932b9a..d4c376ea92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:736ebd4f51a08a254a3d6659f2d7fd4a748a8a568747ac5cb00cf31797240318 -size 1529641 +oid sha256:a157894b3924ad23851555245d7229770bb567ea673a6220ddedf2591816fd40 +size 1097875 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index c347091fdb..996f82de03 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52b2f63b3df84a95f05b39494215d343b6305b75a0922c925d42d6d1473e9d36 -size 1320707 +oid sha256:034211109e4ab6b4b2db3dad76c05716c290439c216a18705d58d17b3da764f0 +size 986719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f8586eae87..2942d02169 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:540b06c63ecc36dd9354502530306c112c0e783ab07c8bb30cd9c12a5bfd428d -size 907903 +oid sha256:bcbd58da0a2027fd1b0501f5142c40a7c1617fc8374de552b137b58ba64f2b7f +size 818905 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ff85bff4c8..55fac57cbc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1cb2d71cc360da0a616833911835920da959058b2601f963c87a74002e78d67a -size 694183 +oid sha256:7119d105f5c2adcf3625a55c70ed67ac0a153bebf8499de32a5ccf1367b6e6d9 +size 697341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 730b2fb7cb..b0152db4ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9feb44882c05da90d6c19a91dcead274255679c96b3c94e10893997a07732ef -size 832371 +oid sha256:9b252445b196b7aa0d0d204ef0d73356ec6ab5f3a2036a097bc6dad6b77513a8 +size 742487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1c2ea6fda0..15c028864a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:415f164ac807f8e677d3e5430eb875b625f29e29ae280685b1d0f01daa3b4df7 -size 641543 +oid sha256:9b23605fe58af20299e5c4009f7b2a84d53f6fe0e9ba14325ff96d6ca61877ba +size 645687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index dbb50af834..40cc4af1f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a9689fffbac44d27fcb2f9346113c966bd5750e839c1216df178d66a58b704f -size 865135 +oid sha256:b7ea308a4333d489300007b7b8bb55423ba0e73f0d6991afa94a073f686daefd +size 854233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7da52a5202..9e832a818f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33eff393614d7d00f52c2b28ce48ce1a71461be83953dc16a7f45aa60dcca4b7 -size 778061 +oid sha256:d3a6d76916a60148336c173ec8459629f6ad201b0566cdcd6c34d451ab914e9f +size 784967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 64a4da4723..2751c7947b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8709d693b1226750529edf4c707e3791fff2af8780228525a975e7d48b8fe159 -size 861873 +oid sha256:97947a88ca3562d7473a55fa7d114efd5373fc5725d45ad4b693844181121e96 +size 852797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5c8b6d0827..8e379b5adc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b705d641612b5972a48a6a4ff53838da959b4bd323f7f8593275a9de8c20e968 -size 799515 +oid sha256:d4b013bcca8b8c59404fa942d0347ff0f584b599534e5cb36c8b9251bc0c5de3 +size 790783 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d1ceb5085a..8a100558cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d2ad310be2f5a751c18ea556e3c14ebb3758ff928993d6221be1b3002b1ddfc -size 1759963 +oid sha256:ac767bc104b95244e305c5e03f8435876b29440144ca8a6fd70174c798570870 +size 1178569 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index e60d30bbbf..b4fafbb9c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce6d53e6c0cd71f4b1e8b020f8374dc9e4c05b09a10be1c90abbdb1727cbbc0b -size 1584427 +oid sha256:188924caca4b538d053b5c40c309454f5e2761d68269078c20439a62f97739f9 +size 1136283 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1813826557..ca3d10efde 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:489a431a2da289bc73b83bed7abfd60da6847e52c6083bf5c81bd318fe78edca -size 947847 +oid sha256:4d80c9877ede9eaecd2b2209abce6110f9f31287fb2b3fba30121ac9f4d1a4f2 +size 824169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index df7ac7653b..fd0be74689 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72258fb8f2126eb479ee2c0c945b4d6d98f364e61ce19374535bb0cba7d8c8e8 -size 729687 +oid sha256:d74e835612676b1d4937da20f7acd824122532695416654c66d5c1925b243a5f +size 720955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 80356426da..6aa5e54e15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47fd6d312a4c53c2725efc1e670fac4710e58c30f89b2a40e4fc58b98d461cd9 -size 873501 +oid sha256:85e124f1eb0cda000172f96e76a0ce0b9b33125a990b448a57cd557ed9b2e89a +size 764079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 15709a2ecf..408d1882ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c81ccec313aaf563eb383036a479572b0c25ea469c5eef58f77ae26c15180857 -size 674629 +oid sha256:0d3314d7e3d46669f8f224a8a2f8c0863824cd316366147f46eb78a81593556b +size 666045 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 63b4dbc867..278e5c2d9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:394db56da8f512103d9bb02598b9a31916e068bb1d74ff99e8cb21775c07ca73 -size 851667 +oid sha256:61bb99a9edd1f6d3f50c3f1ede60874effcbc11f32a3f64aeb0eb5362635f363 +size 832033 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 26cbc1d72e..9646f55586 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30dd4ca65fe27889dbd0340743d09573656c88811335bce961d76472f028d4b0 -size 764593 +oid sha256:d3517a64ecfae53495777b012b75ff07066f795bed77007f060904c0c55dc069 +size 762767 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8b54327556..a8bc738c4b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e29bbbe92a6a43dd3a259c80c5e09ff44db3110f689b0fb2229e80400d73fd53 -size 848405 +oid sha256:59c531c145407f89be45bee357ffae9d6fa669ea8322ffb5515caee4f49a2257 +size 830547 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f602a2d651..18f47ef45c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d9eb1581309a8e1a9f1cf331953027c2f2ebfaf89d0130c43ffd3e4428717d1 -size 785997 +oid sha256:77e37fde063e33ee054df0c7b9c9d6a669221583ac0e68e922c5af64a3c85541 +size 768533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index c9f6cfc651..3678c3bfb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff6c94784e8d2413a384b2b224fe278670a9f06c7ced1c2d92ef739a41bbc818 -size 1745657 +oid sha256:376db316ab2c7846f60c50ee5d48c9d1bd96ea8d571a07349079446ed9dc3dee +size 1158293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3927ea2ff0..1f9bac4e17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05f38a6734f5c7dc37c89a3d7875f10ec5acac00e84ba22c2a27c27829a1e668 -size 1570909 +oid sha256:7b9fe90eaedf0a1c744e679c3b23614d6b025229b42ae51c95f079999396c84f +size 1115957 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 53d108b225..ca1b522d8e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d8c6177e5b8c0c0e96ae87e15f91334e7fa1f802c6e355f9dc69126178e62c4 -size 934231 +oid sha256:08ec985ea862d8d5f0453baee0aba3c95e14b4dd8d081c910576a78f58ad7ffc +size 798171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9078d55692..87c8025273 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a3c535e21d21af6b1e1eb2be7a30a9f9cd3dc885cff059251fe9408f95ba067 -size 715479 +oid sha256:fe4757125c1c43d6c222df80347b8033c7322a0cdf288bdde73b47fa1aff8626 +size 697719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index cc9daf7cc0..897e97df33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfdc6cd0b49bb0a59c174bcb05f5298ec592e10ec51dfc6eb495d551724c23b3 -size 859193 +oid sha256:1097aef0bae7b6c5c1f716249632681983b0976ec404f47414cbacc042c86eef +size 738081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 88d24416d9..3fcf78f573 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7cdd82c57003a96835c024c223253666f3518153157549b1420f2b73ab08091 -size 660323 +oid sha256:22e09b02a0536a416f009e804695edd1a9182b20eaae0a0c20d8ed0b8696abb6 +size 644635 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index f77238f66a..0039c8a752 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9a1c2ba4915877a288c9cbbb56ab7e5da48f6e706805afc53b03fa752712350 -size 939663 +oid sha256:2a674a23fb289801854d2f306753e8078ce504d498156c7d066915a2534821dd +size 753183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 6c18b06842..1413e064b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a671a1a754e72fb22700181d592038be5e72df865a75e5f0400c5ea9c7a6aa64 -size 834871 +oid sha256:8f6bbada16722755be714fd8de3510f532e7da4abdeaeda748bf9b360de261b6 +size 655841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index ced0327634..f181b10409 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5dc8913784b9fff51fb4737192df3bf2c93daecf2e2afd13465bb0a6d75ec7b4 -size 940007 +oid sha256:b3ae648806d7cb1062a602cb9ccc4fc1745b8f6a391041644f51a1175cc59a58 +size 751061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 06f016b2c2..b757b06121 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31320c3901751c900d6213b980c3879d99abd2a9bd066d4695405cf34e82ef47 -size 889729 +oid sha256:d96878c09b4808fa3e71380aebf162d8be5e97273ab57789283ff4d30b9a883b +size 673253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 623c0c5601..9f68b5639c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:617089a1cbb6ba3de3ba2c625bd12ff1af13eda9bb64baa3bb26d4cd7046fabd -size 1083651 +oid sha256:483e52de9326c8cf76999d742bd842f78c07fab2ebfd8852bc0c09a758a33b32 +size 821395 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 9d57e6aa3b..ba5fb6b0ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5082ce8a8abd5c52c20d3c269045ea4d9adc34b55ede2a5af0322ed82232acf7 -size 978809 +oid sha256:e951c2b516fca26cf8ba6eba18b5d713d12ba9e4b5fa7d644d3521c57bb08295 +size 723509 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 32091c5b78..e928e51f60 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78f0b3e5d9c0e8ee1415786b930a7f9c419d7e4d63490f3512f5ecd04d5b83c8 -size 1043759 +oid sha256:4f65631fd33c80265964c8931ff813438a946a959bc200cf247b95a6a3896c57 +size 864925 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 56d8854d25..78bed7a595 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce6898c168d04a8d6b669a6a926ca28c9d73b1029b5360627980548fc11d8bb8 -size 923821 +oid sha256:8c78183a4e12cdaf09d2cda304fab7e1226ba48b7a032bd78f15ef067bc2ef52 +size 811737 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0105d3f6a7..faadb58daf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:447f7d58b8d524f819e4e61498b795b9f7d27ebb47ca62f4c9c0f0d0c84e8483 -size 1015885 +oid sha256:0904484624b24c2e93504db673a5e147f2e80bb5c8d30d82f1f3f0152f5d1445 +size 819637 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ec7ce69071..082e8295f9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:132f95605884e9f17170b625ca879411648a93c64780861f773fd61fadb4685d -size 901573 +oid sha256:f868aa662b4064be8123bd11461d0d4a4832244816b199a00249cd128f79b884 +size 768817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1deb3124f2..809c8bd9be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:761675b64d78c6b88ed81310ac2ab0f7099d79c8012f21149e4f1ab50ae501ae -size 987053 +oid sha256:233c7476c971bf51b340c294b2655884b995052a124d8f5e38cd674d7c1e7f93 +size 1035449 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3b0704c092..e6cf2a7237 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4b6e4b20fed0f00b2c96c7d8341e6d7c8c4b72f31b9dd5c621a508766cdf836 -size 924349 +oid sha256:72aa3acce9b7fcbbe754d8341301a68eac995af84074f8964fa00a9ebb956395 +size 949361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9ff5b26265..ef450f8c7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b00d8c9d648ceba5dd623768ac627bf68d01f444523d032a3e9ea59c84d16b6 -size 979845 +oid sha256:7bbed93982fdee545bd45dc4030a4adb50b9005a72d23cd4459072adffdc261d +size 1028339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 39c78845b2..8de4fcbdb6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93582c21dafef1e187e84957245dc090f757cf08847a816237de0dee9a554f41 -size 924639 +oid sha256:b6f7c279056dbf109f7820c112e127e01201535d7936b902e9b8d0675de6aab9 +size 975797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c205eb0ae1..e6c1b8bc70 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b85cad34e488e488109e3da5375adaf2baebebcafade03518201fdb63928b666 -size 1113811 +oid sha256:f9c7ab64f60c551f4d98b6a06d1e5e0eaa90b1b85bc930b37dba0d052524c82e +size 936605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 5967eedc33..487578b788 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9d27681046e8164a3c0382acd5061792789a0f8cae4d405c11165f591f5f2a4 -size 975175 +oid sha256:f8153a201a2b642b8e11cef32f3ac7d9b2572ccced08c14ff88569f6d8c7d2de +size 832849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c1a979f5fa..df87f5c265 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd7edf48b2b8ce0b376e83914d2eaeb4e82f3ddd617a48ea00b7ca400d891de0 -size 1127437 +oid sha256:fcb146b883cef2fce0e3a8a09e624e59616a4d23c8d57d1791ef28bfd18c4981 +size 1010813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 36c5dd997a..7b84430ecb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e69c7b27127568f5f738e031bbcdf310effce46a893726536f54b8aede63c73c -size 874151 +oid sha256:71fc11d175fc99bdd5db5dda12c98669aee0319f5758fd77b4ed1cb50a033123 +size 914407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 35688f2af3..6141825a80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:858f9ac45902b53aa8d2fa515af7f0f77b4b629f7fd137fa3f7d40a5eba610d1 -size 1071589 +oid sha256:614d4a287f5cd220b261551df9e0734f155beab4b81b78be86378ee16a77bd8e +size 920925 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a443d08870..fab957e49a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab4e67e734cff92a5d79f72039ce5e4769b4ad0a350c8a803925daa72b3cd609 -size 816675 +oid sha256:f9140d69868794cd8a202247a8b1d8a2eb4b2a739bee9427bf3bc564d94ccce2 +size 841391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 63413aaf32..86110ac1e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fd3cb1ca563f9af226b8ee8d514b636a175e4573bc02e290e11c983d45d2f12 -size 960167 +oid sha256:15c067a6389f55dc8967d76a1a03f5f66e0c813f45bf004362b89b5661a6c4a3 +size 988929 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b655b9830d..d8714bb45f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a645ac012a597ec555d45d3b8e8d6953e9164e2baccca3669c68b1b123551bf3 -size 897315 +oid sha256:57549fe169dcb21a10d3cd412ef6b8a40929ba32c5ddbef16360c36dcd6a9fbe +size 906491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3edfb0bdd2..1ac1279aac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:651a312b79a70f1b4e30432368b6861f22dd24ad5a90f654555291a29ec9f613 -size 952811 +oid sha256:53763a7cc490cd2625f19fa728f58f43868931aa97d94c964aed18bc0a50883a +size 981769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1e38e404c9..c1021f082b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eec0b3a1473de5dc99da29d7cbb3f7950854fc000fae2ff29f3e5e1a5c2c9267 -size 897653 +oid sha256:e8b4063e53c4266496853e19253c16599c3520cdac4c92c27b6821c490781e7b +size 929227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0b08b63078..0c9b2e7515 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1eab7cdd7f211c76fada62773137b07bc41fbec67f8d5b93f2ee23c599c1b033 -size 1085247 +oid sha256:a9bfccc9bc3602830232e137c20a3ed8774ceec60ea0013548219fe663a7d70f +size 890527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 46cf695e22..93f39cfceb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:afa74742d87ca14335428c737bdd6789bac6d99e615e5dfec2a1c8c6ac1593e3 -size 950559 +oid sha256:4c0e361f9f21446f2330890e61f87d94bda62aac67037e380c67f5e388cb5260 +size 789485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5ecef077cf..bde3023bef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94ef3e15c61f24445e25d085bea86f04e9766ded414c5feacd9f80c9b2b930a0 -size 1099761 +oid sha256:2c32049c2011175263a05e7127194d2081fb6818579cc4195e7409def51f8190 +size 959999 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5626942dda..68f02085b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57741fcf96c87f7e2809af59be4b6c72b5eb2420bfc71de43381f6db6c6b61f4 -size 848053 +oid sha256:f36508e682234c4dc9db0d0d09013698a77ff1f6bc650c061044f3f8ebccebdc +size 868329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 876c4461a5..ad6b3b0105 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e317cc3e88f0a5cbb48c0d92d74fbe3a063af094bf748d8e0e23176825cd874 -size 1043123 +oid sha256:28e4e8b23d29a75cd16af3e635d9605c5c21013b49fd8caa38530bc50e46c8a4 +size 874847 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a36bd372c1..b4f056efe5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f0390b92225f82ccc731efc336ac3be671ae647152561833f5169577d9598c7 -size 790479 +oid sha256:2cc602eecd9dafe68879e2adf39d7091bc757c3762332e0d00aab39947639250 +size 798027 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5984ceb080..042e1200e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7820070de3ab62d77f6950e4f5c805b9ad95c0e4cb895271edc9ba71fbafe977 -size 1015603 +oid sha256:16bee80f26247ade54abe04bd9ccc8cff7a23ba9e1d7248ec18d78d8ff22fc04 +size 1044807 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 433ca521b8..fc7d93771b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27b29cc3e73e56f9024b8a4c0a19ccc43cbba37b6b97e320b7f249c263bc1af8 -size 952009 +oid sha256:f38928dff838eb8e81b04244b50ed311ede3ea2219e51381a8701f771ee2804a +size 981413 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 756645ccdc..2cdd271579 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:253adceea6fee6416847eeda99ebc0e39e7bd34694a8f6edb49dc42d81234504 -size 1009085 +oid sha256:bd98744f9c7df07e1d9b2be06094b2a84ebcc08da1410e3d434e1d714d916ddf +size 1037697 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9a03b683ef..d89aa15e2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10579bc155c55a634b6bbdc46ccb0614187bece87316553d3a8b0b42dda52e18 -size 951511 +oid sha256:ec2777ced782200a656b352f890e74204670d9b2ab3dafbeba8a50758679131a +size 982837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2e9e4f1eb7..8a731e903c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df0c5811aa15223becf13184cbfb5babc31edc049642e64bfb978b968eae6e0a -size 1187845 +oid sha256:5ef29bc21720ac41dff80bb39f7700ab2e4589fc318d5f1f0355ba79aa447c7a +size 933383 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index bd713d5f28..040116fad7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:200b800bea6f426f2c4cdf1e406db99996fc20c833c335e8933c95c9e8bfbfd1 -size 1067365 +oid sha256:7bffbb1efeb29ecb5f412abef92efab29856d0f94a9b75ef7af4edd705a8e55e +size 878713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 3c325eab38..5e42826204 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7514732999e89fd957c5382e759a272db1012e9de2891c982a6e1da9ce21371e -size 1152235 +oid sha256:3710af707a2bd510b9c98fdcc759cd2d7e9600fbdd0d9b6b54278dfeb75ef5d2 +size 987611 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 14b45296d0..3451921d4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e9c9ef9271863531d4aa1c758c681e6fedca2a461a511fd96286b2eea86c957 -size 899443 +oid sha256:8e931e77c976a3c65bcef5273fd14e844b5c93e3431bad4db1763fca31b1037a +size 873247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b02830b996..baae748438 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4cd5793d39b73441b57d94e837468933eaff443fb9b3b98d33b4a9af696d2909 -size 1093823 +oid sha256:c8b3bdaa17cc117ffe8ef0e562cbfab217a16581fb3f03a1f6ea6e3a35bf2104 +size 921551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2df66f8c20..b9a1878cb0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad7fbd2c14c2dfc5bfc0f786e49ded6889890baad5b9c55db2b862865b60793f -size 838713 +oid sha256:67c4a0a50c9f8dc1b18506e4fc81fae6aa54ccae9ef5de96d428fbfbc5062444 +size 816069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b52d452187..19d94d1f29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf5f2817db8490eca4e8fefacf8b82c4a462b1df54b59a5752704b72cefd4857 -size 989407 +oid sha256:50028788ece365728b8ac58a8365cf301f392f91125791b032e6648cd814a70a +size 998287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f823e07303..69c4171417 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ecdc41b988a1a0b23fde8be4f84469840fcb33047550e9be728a17c798c81cd -size 924975 +oid sha256:727fc27567efb624819ab4b9037829b0ab98f0b0236b4cc34579bd6ad3fcd120 +size 934841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ca95766aa1..fee0c6e1c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4104d4fc3c027ed04598e4baedd6de356a56c53723750b6c8624d5951b30a3ab -size 982099 +oid sha256:03223e1c06c64aa10fc5b41f2699a429a1e29cc8c1a6850c1b57d250a9e37953 +size 991127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d466763e8e..411cd39042 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f86c9b63caa1b31b78d9e831419e491ced213d2fc68e6bb03cad7706b135ad7f -size 925315 +oid sha256:761f93205cdb2869faee998d04c89f85d20358890dd8637cabff4092568949cf +size 936267 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 57c882510d..36ef4ee4c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d075d28547edbee3c3424a421f54a130a5b0b88af1c05b28edbf7f4539541b1 -size 1159181 +oid sha256:fc9662774de1c4e267370190959efbb26b946713fe4a2c6b24e0ceae5c7c68a0 +size 888933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 6989fb5fb0..c61b3b1caa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1278d332746a1d6ef9416c7838477ba0bb29f6224cd92cbeb7badd6c6cedf384 -size 1045165 +oid sha256:d7ccfe4efcf72159fe573bf5916485c9376e0dde6345894e53c1aa0868923ea2 +size 837717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index ae660350c8..23e5108e6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:24473685fc6dceb766151bab96b3c9e2e006f3cefceb5637ca2d9d19e59a9799 -size 1123771 +oid sha256:9ebaca83ef29b0c8f8a3c9cd6e79ed21ce6c64724c17812dbefcc57dd7e5cd49 +size 942175 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f42a9491d3..2262d9ee3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6ee80b6a7550b494473494a86255c545ff714120e437a6bbfe35ba19beb7860 -size 872459 +oid sha256:ecc8c633eaa0debc430a106d1ea7d7eff1a64b08d53b9347fbf4d1926bd1cabb +size 830427 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 5a415b9551..62b671220b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2bc0d3b2d290904bc73e5f0fa7258ac8af729820c2506b86156e4c6237b4585 -size 1065309 +oid sha256:90f250550090af0e575abb3ff8367da67f92a430a15a4875229a601b95342b1d +size 875277 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index efe9af457b..344fb7b6b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:105ae1fef3b9548703012cc74b4f5f87980e4af97b25405e06728ccb656a095b -size 811727 +oid sha256:f56c138c90d73b319ee2e8fe6d9fa0878f2ad13f7fff0bf2a5838254a95753ac +size 772507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 0f74356ae8..1be60bebf1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:efdbdd1e558e01b94e1dbd8fa112f9b0a1400bbd4b379e774dc3fd4e9dbd004a -size 1270541 +oid sha256:fcf67f5fbf7f9ece9b1dc13f2d1aa5b2da5450b9f2c27bd104002040c6007897 +size 941241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index cb502fec10..f6fa0e41e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b2992d80786635d3e11e0a2ec9455b4bbbd2f5599562eac81a5683e672f5432 -size 1157263 +oid sha256:9f9965a6153b22b37bbc92bfa5737f679ca65db6596d649f690524894a4215f3 +size 856429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index be2ced200d..e69b837363 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3674d0419ab5fc1a6178e1b65f11e403a5fac749b59309ce93f667c031b9b1ec -size 1266937 +oid sha256:761ef3099c838c1251ab107b0637f40c82d3370a502586b516637324010b172e +size 934135 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 318a04c86e..fb12138fc5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5031b72b20595fa5a91934b91f3acd7c7e7afd05f7d0250adee5550242804c4a -size 1209011 +oid sha256:196245e1b09cc5a6a7247ff69155b927f32b3a861e3e4b355357b131c54319ba +size 828455 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 9967ae0e3f..c6b17a0d22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:030e199a6cbe2c6c17d01c89a2fca1e100c4025c820d9f716874f062bf3cd186 -size 1552365 +oid sha256:51d4be02f3e796dd06f9585e4931e8f072459e8284be46177402bc6d56afc6eb +size 1067467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 553f9c53b2..e109293ac9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d8e6e49f79fc1a0986b44c69a35915b97153a38fc155a2262333ef44852868c -size 1437853 +oid sha256:2c9ba70845ea02c57a9e182468aed78eb2bbc9f831f50c11fd7f7eca37738b31 +size 960307 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 14ef035073..8ac567845b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2081790946eab9964f473fb1d621bf6125cfc5ffb538e86bfac9b5ffc43b1a89 -size 1361463 +oid sha256:ddfc17cd0e8db98be3571ccab1d6a5dba6baa147267975f61d6b7e5258e809bc +size 948149 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index b6af244bd7..60b9b693f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93e088d3197f0ed54e9009319d2944be5800d6073511e3bafbea87563e72dd30 -size 1224111 +oid sha256:2f72f465cdfdc8dca556a31108162952ec13ef7885f1a8d3acde5e1946ede9f0 +size 901867 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 83c3caebf9..19520a3731 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ca54637ad2e1a5191c1fec7bf1aff5771493712af22cf9efdba8937c982628be -size 1355839 +oid sha256:837a6dcbc328a5e8338287c18adaea22637cdbfe587edec9e6143f2b2c0643ee +size 938331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index d2a413df06..d11cc17172 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f76ae45cb3672c73b56a034063809ed1ecbf102f8711cd0f04bb698634f7655e -size 1216959 +oid sha256:ea6d1e8a6a706af3310a373bf9033e89d6fdbd2e44ae66830f89f4a992bbbfc6 +size 891803 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 1a0e0d2579..188a69c706 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:364db0e57ed65b1ef264d5684f6b37edf9c493104caf40b30328bd86070d8c6f -size 2026163 +oid sha256:f8151b0d09ece04a05828e5f310ff7e8eb8bcc6f1b0f826a3af8bdeda16c172c +size 1083107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 1d0b53a9d3..9b228ba3fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e327187454052ef73ed75a82867a275861312ebd466c84fb79ab00e1a60a9edd -size 2013479 +oid sha256:4a18b483171629f9957531914ed24f9f4ce6e743c3977baed78e9ddcd4f14b30 +size 1069437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index ad080a0721..aa6d14380d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f13195a6381a918e516b9ca08dc0be76d62eff6c47e2d68c1195a30c8165f96 -size 1278743 +oid sha256:d81583afd0c2fd273fd6d0087b9ee5342cae6804c548bbfdb1c3e257f9132c3c +size 949493 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9a0d7f3b05..7fec48b634 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9de48496411510158ea69dc9b8f6f59808c1502feb5f928a0eebf7e4cf8af88f -size 1180759 +oid sha256:ed314d9cb4acfb733923fd500fde811bb989481e87961de6ded57b3111d700cb +size 843171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8b327249ed..2bacf74f75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c916a1a49a4c1a6a9f796abcec1141d4675e4e297a6e49b7c29233e5407c0355 -size 2019011 +oid sha256:43e59695cc009e528579bee271ced51d61052e79f101dcc78c897401a68a38c0 +size 1072205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index e55ef737ef..5e1b4a60a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8cbf057b106acd42d5a187a97a870c98e4fd25bf15503ca8714b264f2c4638d5 -size 2006325 +oid sha256:5785203698c06e61412776593a2ab4816fd0210e95c16b53034069b7d557b097 +size 1059373 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index e9efc10777..80e21cff91 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53d6ae9236d91d56f43f79ae5bdf669bd2e13a53c2fd30c12da9bc6fdd9c4121 -size 1272379 +oid sha256:d131a6458535f8ba88c46c8a914665a1a3a4ddfab4769cade6059ee8bf084e30 +size 939775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d7df95e048..cdc633d474 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53c865d6ea1eaab0747f25071a04a8a7b90fe520ca65fd0869c93f4fcc830422 -size 1173607 +oid sha256:ead6c2eea4ea4bc0d0d0473ecb38e9ceda7260f2b6cf95b461d1901f4c04680e +size 832319 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 67c132e0e3..e2f171051a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d84dfd1d2a4ba06053160b7ccc454583205f139aa9b2b6b83c8106a8c244dd2 -size 763571 +oid sha256:41f1192fb415ca972147ab5c89fe5feab064553b06ad74ec2beecdda3f96144b +size 767567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7fc9a23cdc..c4ec7c4b3e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0cbf603b638a80f2e50f4af44876970b9f8769203d07da6502bc23a68d094fd -size 720995 +oid sha256:c8c82f7767bf909ba981232e050c27ddf7897e819d535fec2f1dd04c26c78cf2 +size 727557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 430fa02d28..bdb123d69c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a53bae8062065aa90e5186893608bf57edb65a2d1c4c8b50831eaf42c62f9a4 -size 786653 +oid sha256:702f48a0ed3e30542a37b284244467b7793a70e32611c8e7355ff8e7e5fda39f +size 786901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 561a55ee15..45b1389f3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:678b6885c692984c9aa4df7d47ecaa86fdef30eb9081d4c517f9aa1e319e8032 -size 742597 +oid sha256:9ed0202e3a8a82b69143176fe4845434080deea06d581dccc1604b31ae48dad3 +size 745409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4e91c36801..25e9dad6be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a97766bfdea9869f49848d084ba2b651d71ca9461021e0c9c48caf7de2fd8e4 -size 1427765 +oid sha256:2132f1a98c23565fb2c5733bcf126e61abbaf9977f82c8c9bedfb8fb7babd049 +size 1007545 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index f6ee4a3ca7..8488076e73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:095d55f37a94e21a24fc4b7807a2aaf9e99e0de3481a5b2cb4f347df3c87feda -size 1274183 +oid sha256:1aef810b8968f0705030eccdd202605f7c560e16377056c613f8c52bc3e62647 +size 899497 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 912a972dad..7dd90f3ca8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e94061d7fa63e13328771be4c6a0faa24a3b2a81e3227f50bbb985d959e54473 -size 865375 +oid sha256:35cd50ea7e8b610141bc053ce5c866baa272172a742da0eeba00d5035fc4637f +size 774651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f124b8561d..2e37c023c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d32d19aea4962167d25e5dd5f2107066a33d8ac32c784827c181961a4566541 -size 666603 +oid sha256:cb4e352feb6c9de5a9cca45015a18748805561b539c5c68984745ee793c83b8f +size 672819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f4c1bb2464..b6c9d65401 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:393dedca1b6d8666995f954f6bab887b26b313ca03e02f5adcb337a4b8e02263 -size 798033 +oid sha256:dfd601ad436c8b6b8f70419b465f4e40fcb4cabe59fac809ee479f465a5f988c +size 728177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0042b9913a..e8928e2091 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14074054ad6b5ea812c5b42d1a8c2206cf04f0ea31b11a1ec5057c0c39adb8d7 -size 624027 +oid sha256:eae45c4e0af6fb7c03e1183bbae902a8536210f59a4be7524a1a595bfe107cb7 +size 629553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4517e803d2..ced3a52416 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d482dc95465bdb99506b2fd81e4b3044f0863077991e160bd3c19ad3f2e3e26b -size 756419 +oid sha256:8a1ef0e42e5d14d29d4ebab7955aaedcbc1287e56c575733b6fa707ae4850c66 +size 756665 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7b4fb49908..fe6160a019 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:329b4489fe175c62785638f70d3dfe8abbe48d508921c277859c351811671011 -size 713053 +oid sha256:62bf9cfe670c5a51273cf28ff284ded388ee48a36bd1f5d92244801f75952700 +size 716703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c6672ab5b9..60d8db1a6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:741432f5bba5fa67dc8497c1c6915b9b4c935ec9599e14459ba984e4cfdec9dc -size 779501 +oid sha256:3a96ffc9934ecbad18951ee86e5f93fb6276db3b4ceae74364ceb9956c99caf9 +size 776047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4d820e7452..15c3cd6731 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:053a02d29e95937f49abf1680f942d08046b23e0f4b29d05ea0529b157a92c9e -size 735443 +oid sha256:1c0533245d8ba5d198426f7c399656d44bdc3cc81cf011007401ad4af1eec5c3 +size 735295 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 43402d5d92..80c30277e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82b4d2dd2452d5c2636ee5e0c1b162474b3587e267e13b62232569999d14b248 -size 1422289 +oid sha256:8dfed7aa366c2ddbb04cf44c141b44ead43348cc1395a59456ed1f1981a0b838 +size 997777 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 7a882390b7..99a444694c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48309083412b0272bf59ee18b9aa9676e7ed952834deb0226f957bb719dfb9c8 -size 1267029 +oid sha256:b84c8bd55d6816af7789e8b7ca65889d789577a293ac6ea5ae4b88a108a5caff +size 889383 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c356012c88..da94dd42ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c09f1f9d9d2995c17cbd7b0f7c0a32af52c2d97073569ac2703354bad99dc4b -size 859703 +oid sha256:6a6ed77209905de5403b729c07cea4895c53c36cf758861d1000f8fe205e4a42 +size 764785 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 17a99eea27..9cc8c96936 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e16888029ba205c1dc3d612c36511b71942ca012fde107dfc2ee273874712ee -size 659451 +oid sha256:fb2120ee8c89237b577c0f2974d239f45991f903297d92b48662f1527f7f4126 +size 662707 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 665fe29246..80e209a188 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c8cbdbba464fff9e55aa29fe699771469c0566c14adaf016a943fc1fc4d87f9 -size 791719 +oid sha256:21e7ead4cfde369c8680e2b8a566cffc45542d3ad58f4315ac99d11619258e73 +size 718459 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6f755cb673..be31cc6637 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01f650c0e30e88eb73c55cedac9770d304cfab5ee8204d7e05a2e59e71d7e6bb -size 616872 +oid sha256:eddb14194e681b38821d116478ab696a294b1c8e1551a9b6bc67245c3fc17165 +size 619439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ea9c860752..ca17289059 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19e1610b1a282d5d9807d797429ec9060ba414b192b52a74cd8c10e5db5f0c43 -size 792021 +oid sha256:0d72977d883d6734148bcaffaa4543763991201ec46b6b36404f0e769d3ea7ff +size 790443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2eac86b659..cb6568f152 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42052666e4b7196239bf5039aebdabc703625c4b63477084387fae8e458b851a -size 747767 +oid sha256:b25bc8581ebdf979f23ea1f973847543ff10687c10ac20e7311c17d418c07e78 +size 746583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3d25082198..6209247e3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e047ebb283356a81343614dd66d95a39970857646c13ee234ea24a5a49811168 -size 818557 +oid sha256:e4da271ccce714e51d66eb7e2fcc217288820d7fb447748d353054803af0bb74 +size 795469 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 323cdc50eb..2a45ba755a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad67112d305c2fe4200107852158a58e74e6b8ae371cee6cc75d2bafe6aa32ea -size 772823 +oid sha256:c63a909d2433bfd13b627ab3b2f48956bc0beb12b8c94d630d7556b4e542d427 +size 751659 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 92e93fd59a..00b1832c2d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:188ecd3286f4fc960804d090464601a077d63d5486cc1d6cf766326e50597f20 -size 1639785 +oid sha256:fad2dcfa5edb4b58285d91ed354ac6de636f1d99ab21c1d55850a9efe655a64d +size 1073537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index a96e5affa9..1544a75e01 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9283c2bdf7e46c7f2b88c308a23aca8039f3db84770ae74153e5df9ad95bc85c -size 1502433 +oid sha256:bc1f6792f1b963dbba48df1f07380245c2b8da05cf2095e118e6ec67e44dd5b3 +size 1031349 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b35993fe43..b2072604c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27813be6e55bcdb360282fc9ae4476bd7221a70b92161b2ccc3ca5849e9ab0c2 -size 892445 +oid sha256:7d05c8d0c88d509461c6aeb549b56c1560ce381a69d36adfa2218554aa4d18c4 +size 773157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fd4a155b03..30235a3301 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e58b4254d6ae559b6706e4042e368832e09800307b9b8611dad18fc8f6b3fc0 -size 676701 +oid sha256:b151a13fc185e892d56d6f3db45d9d3dc339c22b87a7c5e1f6028204bd5f6f60 +size 681881 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 11df624a2e..c7705edcde 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:977025c96cb2f21aeee3a53c2c7d89af9b6e2c77d38783da5ec443ab0b591afd -size 821995 +oid sha256:7ea6c8157ca81fc56ac0fdc13403fdcde104c1e2bb894023f24065cc11f46aaa +size 721207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 86c49c5a8c..0852ff0082 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:424c8fcd5e6101314a6c8a2db56d782b82199e27187b37b2f2b08f3b3036c6c1 -size 630967 +oid sha256:ab5af148746491b427f070ea476d1adbab88aadcf3be62d853f649618f6127c5 +size 634371 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 195f622c79..35493704fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7f47b2be9606b5c9691e1b31fb76f89df55af98751a5b306d46d3c602461e3f -size 784869 +oid sha256:9a501999df52fe96298f548eca61741144c6c990d4695a09373da4e703a3c5c9 +size 780379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fb58e1985f..1ec90ae585 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52f5933e48fcf64e82f1fd7b28e54f034fee246f91db2233d88ac4715fa813ea -size 740615 +oid sha256:db775edcd68ecbc02908fae499939e67cee88a8505162950e688fdeeae414b55 +size 736519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 14cd90ddfc..933eebd933 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:484d55af3315da6c3cd982d21c2fb0707ccc421d0f1c3be5cd9e66a8dc41fa7f -size 811453 +oid sha256:51ea600190d09cf1d11afa19ae72aeac6bf63e2558a88b42456620418e109684 +size 785355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6f26fb143e..2d7859a477 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9fafa0b2f0ebe995fe6c761f42a779761866b0cbe507d4673a55b3179135f749 -size 765669 +oid sha256:6f2155736bd689cc372f1e67d1dba9a799b570cdc8314245e8c7d01204b54e2a +size 741595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 2e51ca772b..82ddf03719 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3dbbf2d656e76029ff1a18028b14d6506aa53686d0a5019650dc4631265f51e -size 1634211 +oid sha256:ffa048eaaf155732e2f6303620254330fe9e8f2fc7cbe4d04d21c032c303dc50 +size 1063819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 90d9008119..30b465f1c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b48c331ada7878715678d9694dc4f93fd7558101694053347da7eb995cc37306 -size 1495279 +oid sha256:51640cb7b1be83e80a1220f295a3f1ee59ab9993353f76e0090ed4edafbd5a87 +size 1021335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 84af5a9dde..e712cba2c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:994e913ebcc302d3efccfa09fb1769747dcc709c36820f00652455f7f8f804cd -size 885981 +oid sha256:75a47363ffc6bd265ce03eb466ed4252129dc079b6cb6eaf3f358fdb3c4625dc +size 763437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4500a86a56..fcef9bfeb9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f8fac5154d653580709aa90900e8a4e33b8e8f8c46261a27c10abc085386eff -size 669549 +oid sha256:c9127d85179cb6a31c2c5ac3e7767373e3618f0f66cda7f7f71d813e6c575ccf +size 671817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index db782830c0..beb19dfb12 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5b26a0e6683704855208aad1a51e5b9663951e66096ce6706b1d3b4edfccd6b -size 816419 +oid sha256:addf45b193e5376865bb5159c03fc02e904f163b2a14e84bbaa27628413b0daa +size 710699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e3a718fd72..cbd52f8879 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d50b9a517f7661d4e02fba13bd724672eb0600c20860d0e91f69d4c9941bcd4 -size 623815 +oid sha256:f13fce8db76393678316bc0851de4b31bbb32f3b7f2f57d4dd5e3dc99db70aca +size 625295 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 3d98e641a3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5237695383ac6ebd9a09edd1310f01dd67f969bab298106c6c45c8c2afffd7b2 -size 1377709 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index cc463de225..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:932169fb61d0bff14d9c04050c73a1d96420e6344a0086fc487f6e54663e12b7 -size 1257525 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 596dccd77b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a29e32bbfb502bdd1a87de3aa35fab1979dfc06ad331f8ff5c48258c74b8bdb1 -size 1374105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 8a508f19a5..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:35d3a101bdcf37ec6e0d0debdd6611c5c8574c4a0e7e5a94261aea1387d53b97 -size 1328513 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index f1da16828c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b864499417a75d01e5df2dfbef415115f074d1b2d4a2457c58d1edb80fb46dd7 -size 1593099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 5a7310fac9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:86e1a7f5e9d3d4fb381405f2b7597b00d7d7c1463e63f1cdc4d5498cfe119854 -size 1353331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index f5da96f52b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7d93a716be034cd35acfb9c19ccc943b34fe5a6af91600940b49f9d66870e3f4 -size 1572331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index aba58d34b3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:de48406daba3eae23b1fd673e001bfa45cdabd0d977e3eb73d3dac83b24bf305 -size 1331871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 3a1f435a00..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3dbc65dca90afdd9f4c269c0e4d685371881e5620d046a7e9da6ba0a9aad2035 -size 1658761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index b1932ad38f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dd973f2d674884470147e91ee3530f11b0046635723a7013b5724437b6e7c150 -size 1422001 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 18eb6aad87..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a357a26c7a15860d7aa3dd223ef122ad0d6114bb09a52f3563e3a07b35037bdd -size 1638041 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index aeca716f00..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:71be6d39bafa47882a3adf5ce7c8c6f564b3843f7a7331e046306b7f528fb624 -size 1401381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 3021a5aaf0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eac8be0e44d52e97470d1f88dcb68bf02698837a0af4e2625a08e0f68f2f6a12 -size 1378501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 91eb5a9ce2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f33f05d74985fce29c43f5d4d7302a80a304911d2d64d3d467a1456874feef77 -size 1258317 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index da11e1ffc2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b9032204e96b80d0528212e912f3fc0e356aae071232e24d23d48b72f2912654 -size 1374109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 5e74a6d31c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e08a2523696e92645e72943a558cb0589341045b2c3458f8a89b63e6402b099d -size 1329307 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h index a81c155ca0..cab406bb67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h @@ -24,8 +24,96 @@ namespace kernels { // clang-format off -#define TLLM_GEN_VERSION "0216f1b" -#ifndef EXCLUDE_SM_100f +#define TLLM_GEN_VERSION "c930bf32" +#ifndef EXCLUDE_SM_100 +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; @@ -336,6 +424,16 @@ extern unsigned char FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512P extern unsigned char FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; @@ -348,6 +446,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCta extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -356,6 +456,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCta extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -364,6 +466,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunked extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -372,14 +476,28 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunked extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -388,6 +506,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCta extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -396,6 +516,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunked extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -404,10 +526,22 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunked extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; @@ -420,6 +554,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtas extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -428,6 +564,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtas extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -436,6 +574,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedC extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -444,10 +584,28 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedC extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; @@ -892,22 +1050,16 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausa extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; @@ -920,6 +1072,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvC extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -928,6 +1082,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvC extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -936,6 +1092,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCaus extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -944,14 +1102,28 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCaus extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -960,6 +1132,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvC extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -968,6 +1142,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCaus extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -976,10 +1152,22 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCaus extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; @@ -992,6 +1180,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCg extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1000,6 +1190,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCg extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1008,6 +1200,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausa extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1016,6 +1210,8 @@ extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausa extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1186,25 +1382,97 @@ extern unsigned char FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausa extern unsigned char FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -#endif // EXCLUDE_SM_100f +#endif // EXCLUDE_SM_100 -#ifndef EXCLUDE_SM_100f +#ifndef EXCLUDE_SM_100 +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; @@ -1515,6 +1783,16 @@ extern unsigned int FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512Pa extern unsigned int FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; @@ -1527,6 +1805,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtas extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -1535,6 +1815,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtas extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -1543,6 +1825,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedC extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -1551,14 +1835,28 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedC extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -1567,6 +1865,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtas extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -1575,6 +1875,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedC extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -1583,10 +1885,22 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedC extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; @@ -1599,6 +1913,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasK extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -1607,6 +1923,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasK extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -1615,6 +1933,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCa extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -1623,10 +1943,28 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCa extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; @@ -2071,22 +2409,16 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausal extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; @@ -2099,6 +2431,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCg extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2107,6 +2441,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCg extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2115,6 +2451,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausa extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2123,14 +2461,28 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausa extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2139,6 +2491,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCg extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2147,6 +2501,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausa extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2155,10 +2511,22 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausa extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; @@ -2171,6 +2539,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCga extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2179,6 +2549,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCga extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2187,6 +2559,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausal extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2195,6 +2569,8 @@ extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausal extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2365,23 +2741,7 @@ extern unsigned int FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausal extern unsigned int FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -#endif // EXCLUDE_SM_100f +#endif // EXCLUDE_SM_100 struct TllmGenFmhaKernelMetaInfo @@ -2412,1187 +2772,1368 @@ struct TllmGenFmhaKernelMetaInfo bool mGroupsHeadsQ; bool mReuseSmemKForV; bool m2CtaMma; + const char* sha256; }; static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { -#ifndef EXCLUDE_SM_100f -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 167088, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 167088, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 167088, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 167088, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 167088, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 167088, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 167984, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 167984, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 167984, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 167984, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 200720, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 200720, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 167984, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 167984, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 156816, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 146064, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 167984, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 167984, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 156816, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 146064, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 167984, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 167984, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 156816, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 146064, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 167984, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 167984, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 156816, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 146064, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 199808, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 199808, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 199808, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 199808, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 199808, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 199808, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 200704, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 200704, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 200704, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 200704, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 200704, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 200704, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 150160, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 200704, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 200704, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 150160, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 200704, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 200704, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 150160, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 200704, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 200704, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 150160, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 85136, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 85136, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 85136, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 120912, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 86048, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 102416, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 86032, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 120912, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 86048, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 102416, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 86032, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 115920, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 110288, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 85136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 77456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 115920, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 110288, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 85136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 77456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 115920, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 110288, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 85136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 77456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 115920, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 110288, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 85136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 77456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 199856, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 199856, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 199856, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 199856, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 200752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 200752, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 200752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 200752, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 200752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 200752, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 200752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 200752, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 199856, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 199856, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 199856, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 199856, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196816, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 127184, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 208936, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 182992, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 115408, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 208912, 384, 2, 32, 0, 3, 64, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 81552, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 185488, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 115856, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 208896, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 208896, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 92816, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 81552, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196816, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 127184, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 208936, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 182992, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 115408, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 208912, 384, 2, 64, 0, 3, 64, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 81552, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 185488, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 115856, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 208896, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 208896, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 92816, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 81552, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196304, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 126672, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 225960, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 208936, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 182480, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 114896, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen", 225936, 384, 2, 32, 0, 3, 64, 0, 1, true, false, true}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 208912, 384, 2, 32, 0, 3, 64, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 81552, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 185488, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 115856, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 225936, 384, 2, 32, 0, 3, 64, 1, 0, true, false, true}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 208896, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 225936, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 208896, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 92816, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 81552, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196304, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 126672, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 225960, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 208936, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 182480, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 114896, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen", 225936, 384, 2, 64, 0, 3, 64, 0, 1, true, false, true}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 208912, 384, 2, 64, 0, 3, 64, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 81552, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 185488, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 115856, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 225936, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 208896, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 225936, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 208896, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 92816, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 81552, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196048, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 126416, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 208936, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 182224, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 114640, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 208912, 384, 2, 32, 0, 3, 64, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 81552, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 185488, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 115856, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 208896, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 208896, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 92816, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 81552, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196048, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 126416, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 208936, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 182224, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 114640, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 208912, 384, 2, 64, 0, 3, 64, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 81552, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 185488, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 115856, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 208896, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 208896, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 92816, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 81552, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 184336, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 184336, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 158352, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 158352, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 158352, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 158352, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169104, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169104, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169104, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169104, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 120912, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 86048, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 94224, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 86032, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 120912, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 86048, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 94224, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 86032, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 87184, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 82576, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 87184, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 82576, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 87184, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 82576, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 87184, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 82576, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200912, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 127184, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 217128, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 193232, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 119504, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 217104, 384, 2, 32, 0, 3, 64, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 85648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 181392, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 105616, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 217088, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 217088, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 166544, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 91792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 85648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200912, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 127184, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 217128, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 193232, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 119504, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 217104, 384, 2, 64, 0, 3, 64, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 85648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 181392, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 105616, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 217088, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 217088, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 166544, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 91792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 85648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200400, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 126672, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 217768, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 217128, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 118992, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen", 217744, 384, 2, 32, 0, 3, 64, 0, 1, true, false, true}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 217104, 384, 2, 32, 0, 3, 64, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 85648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 181392, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 105616, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 217744, 384, 2, 32, 0, 3, 64, 1, 0, true, false, true}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 217088, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 217744, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 217088, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 166544, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 91792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 85648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200400, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 126672, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 217768, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 217128, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 118992, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen", 217744, 384, 2, 64, 0, 3, 64, 0, 1, true, false, true}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 217104, 384, 2, 64, 0, 3, 64, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 85648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 181392, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 105616, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 217744, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 217088, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 217744, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 217088, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 166544, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 91792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 85648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200144, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 126416, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 217128, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192464, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 118736, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 217104, 384, 2, 32, 0, 3, 64, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 85648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 181392, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 105616, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 217088, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 217088, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 166544, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 91792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 85648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200144, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 126416, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 217128, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192464, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 118736, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 217104, 384, 2, 64, 0, 3, 64, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 85648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 181392, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 105616, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 93328, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 217088, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 217088, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 166544, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 91792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 85648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 85136, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 85136, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 85136, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 158352, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 158352, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 158352, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 158352, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 216192, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 216192, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 216192, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 217088, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 217088, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 217088, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169104, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 217088, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169104, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 217088, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169104, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 217088, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169104, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 44176, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 44176, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 44176, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 45072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 45072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 45072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 87184, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 82576, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 45072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 87184, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 82576, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 45072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 87184, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 82576, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 45072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 87184, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 82576, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 85136, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 85136, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 85136, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 184336, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 184336, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157328, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157328, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162960, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157328, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162960, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157328, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 216192, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 216192, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 216192, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 217088, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 217088, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 217088, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 217088, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 217088, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 167056, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 159376, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 217088, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 167056, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 159376, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 44176, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 44176, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 44176, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 45072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 45072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 120912, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 86048, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 94224, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 86032, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 120912, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 86048, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 94224, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 86032, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 45072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 86160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 82064, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 45072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 86160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 82064, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 45072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 86160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 82064, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 45072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 86160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 82064, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 117904, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 117904, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 117904, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 117904, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 118800, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 118800, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 118800, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 118800, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 118800, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 118800, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 118800, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 118800, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 117904, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 117904, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 117904, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 117904, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 184336, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 184336, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 158352, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 158352, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 158352, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 158352, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169104, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169104, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169104, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169104, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160400, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 120912, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 86048, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 94224, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 86032, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 120912, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 86048, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 94224, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 86032, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 87184, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 82576, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 87184, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 82576, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 87184, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 82576, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 118992, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 115408, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 87184, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 84112, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 82576, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 80528, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 167088, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 167088, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 167088, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 167088, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 167088, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 167088, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 167984, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 167984, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 167984, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 167984, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 200720, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 200720, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 167984, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 167984, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 156816, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 146064, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 167984, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 167984, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 156816, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 146064, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 167984, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 167984, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 156816, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 146064, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 167984, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 167984, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 156816, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 146064, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 199808, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 199808, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 199808, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 199808, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 199808, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 199808, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 200704, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 200704, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 200704, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 200704, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 200704, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 200704, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 150160, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 200704, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 200704, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 150160, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 200704, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 200704, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 150160, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 200704, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 200704, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165008, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 150160, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 85136, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 85136, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 85136, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 120912, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 86048, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 102416, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 86032, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 120912, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 86048, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 102416, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 86032, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 115920, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 110288, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 85136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 77456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 115920, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 110288, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 85136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 77456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 115920, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 110288, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 85136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 77456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 115920, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 110288, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 86032, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 85136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 81040, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 77456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 75408, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 199856, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 199856, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 199856, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 199856, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 200752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 200752, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 200752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 200752, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 200752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 200752, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 200752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 200752, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 199856, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 199856, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 199856, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 199856, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false}, -#endif // EXCLUDE_SM_100f +#ifndef EXCLUDE_SM_100 +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 131184, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "13fa30196df3f1912cf672a79ebc80a5128106e8cb718d26d38067a5a37accb1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 131088, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "3f8d2acf87227c0bbf6f15f54f469f14ca6e8adac3b8f3f95ab99ae5255018f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 131184, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "2201d2f283f80a8997fad399651245881d1cac87ae442a269d908ad58360761b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 131088, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "74c7ec8073a6ebbce7bf07a23e418d85906de07bb39fe928ebf8ea447ccbd100"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 201936, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "af617d34ccf8ecbedf7fcc7eb35a188c03d1f1dfadd235f47aaf1f4c73649ced"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 198352, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "ceedade9012ce29461a9e1a7df86c3fbae4bebe8eae32b6176cf37d9517bda3e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 168080, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "f90ecbb5e199c4f31d36d7bed29d3d0c5b08ab3bc1cd5572975a3c23bd62aee8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 164496, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "06e8e674e1d47f67731c0fd859e9ce7c095119aed01cd91dc36a96bc223a0fe1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 131184, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "e0e7161b9f0659553a2dbfb9232fb9f99c7c2132003cc051c351d0e71ec5b500"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 131088, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "0261d522974608dc8041eaecd433bc4875f5e325f8d83745894b6862a8d2c69a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 172272, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "56ed2c8f00b68eea494bfef75dee6f9054aff9b06d801683a6a573107790a37c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 168080, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "5f97ac26f4c8d318b66ce48e1b5a4a9b9f6f48df1c14c6ba97501ebf1d402b5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 166640, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "356999546de1c03083b44d061ed29044ef54fd8b24971ec71d2e423a4f9266cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 164496, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "063ade6ecaa4366a05f269a5aab8fdec3463a5cff9a0abd5adaca6c53d80eb1d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 201936, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "6afb3b1a411c376e203f7210b3fc43aa875057ee108cb5c771ffba00a71199b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 198352, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "2cc3ffe5e63ee58f864a7f343c8e8f8c864741efb9941b3c52fa863d6869b344"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 168080, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "c80cbdf471e72aba1dd1fa10aa8c0e51ce1c8ea5b1c4718811ccaeb93609baed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 164496, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "5e4d11816d4826c4e0d5e0d8183e18610fe551d9365518e1d5c9782022fa66a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 131184, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "3bddb9d99af3f0da04aeae6ea9fd07d0bd38c10685329f42f5b4da3adf838357"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 131088, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "c89b574827e476f2476f66e6fcb7782d2e49b701caaf3eb7b678f9ee75d71b2a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 172272, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "b2383fa9f593c621cdabe3f1743248a04a01dfe9f22a7fa09d5cb8289d42ee48"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 168080, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "6766146691458ef10b700ee2d1f6a9d9c98184992b9f6a2c372eb002a4d59370"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 166640, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "5aafb7c7de5a729039adb394d7050ae055f741f2880799edc0c74503adda88a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 164496, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "aef3e6601b9b0b1b5fa6d0adb692efdc3609819bdf49f1b9d6f5a0db14af425b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 201936, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "8b3a00ee029831252d3aa628d274942f01cd0ad8ff7a8d523437e05fe4d25d04"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 198352, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "c9571dd28ebb43daa1fcfd658908a262d4656e2e8e0760d1e13faab980791f6e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 168080, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "0fcbeb4039c2cbd3735a0fe8e24d77f0f35c94fbd4873453d218ae01aacb13be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 164496, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "c2c19de23f4ae040197d295a2d4fd6d1ce01c90047d264bc3980f016630cab97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 131184, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, "85fce1c2ecbe7cc3bccdcdc48d8a5b9b6750db3dd05505ecfaf11f57b314f047"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 131088, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, "808b1db3ff47c07f3d02aada9b8522fa6fe58d89ed81c60789c55a4db29fee75"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 172272, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "1c0337f84ee2b3a03ae65ad6c966fb825cd64376372dcc753ae8a8be68eb31e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 168080, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "18fc60d129ba2048feebe32effdeb8f3cc1b922796dec3f47082517bf8e1de01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 166640, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "9dbb45e688a4a011a842422e0d1b5219cd5e5c2b561d4049dac69740be64eee1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 164496, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "49eb8c8341c493b35245dd12e87efadcd61026059336175bd26df065931e7fd0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 201936, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "1e7fea1a54e4357dec2355537414aa0046de2dd5fa2534d550a491de83344adb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 198352, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "0e9704d6f14d95f86d08ba58a37e376952d0cac9c547a81a7729ba8ecd18318e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 168080, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "46a5a3a05c5b2e816470c11fb2be3af5e47074e470372fe61f4d7badbe1db548"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 164496, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "0b09333a70633b4e1d98fae21f49f360b873e5ad768bc574f1dacb4a780e1adf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 131184, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, "f95d63fac761042dc36d5268b1c7ee5f04ea1fa5b04607ec77f6908adf28d262"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 131088, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, "15c7b45463ec44be2de95a5af5bd947edaec03c1edcfc23d34885d26470fd4b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 172272, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "9eda2ff9029a1abfae230fd2f517476d9f86196f0634566005005527aa4e0020"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 168080, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "d921bb0569f55bf02e8707975a0ec03a5fc19ac5da280147187388ea3393a532"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 166640, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "d40142452d9559f7b42029a65c4e603e88fe99583dd19eb0dee361aef6efc86d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 164496, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "12cb9c391be6fa3a469b8b6dc29b70709da82044cc3ae0ea026ed768bca0bc75"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 67696, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "83840f1ec01eb0c945b89ac9cff04b7c729d8d992ed2129819c1f567b7166941"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 67600, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "01b13ba9cea207e044ac67b412065900bc0cf853fcc03bf27dd54d923fc31139"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 67696, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "67d5f5d732ce91afd35117de003c4c60b570c70102df0a5bb2ab80f2694d8f97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 67600, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "0c7e06a7c5a90437ee1d27e9d3e044c6db76df7de7be330500da5a92f9216aea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 162000, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "218a93c9c215fa668e6546be6ac4d4dd3db9ef9f270bb9026faa9331653ea81c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 158416, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "178b1544d68595c232fa5f5a53bd9795be9e12ad81040c543df51019d80ad493"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 127120, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "2df608f6a523e0c979fa34358db29be262dccae9fd7e5a2aa6eaa168cdff6e08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 123536, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "754bcea9e40446047a0fe560505c28d1d44e1a32ee6feeb687375797615fb7ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 67696, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "26042d91b99b5b42d305b0fe9516abe7f8553605520bd82c8a10b1e73d97c68a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 67600, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "ff33636f961b56546e59d6fecdc19858c44d6877ebafe57e2bf81cfb8c5fb226"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 129264, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "4ab85ace8ff3777805def58e3004679b491ae5959c083d25982ddd10b20c007a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 127120, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "7cef4a03577d8f15f5f236daaf4a8251ee130df0a8956a75777c1a3abfab53c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 125168, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "ad66d9d3afb3438d862bf5881572d76792bd0650151f24866baac246d61ffe02"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 123536, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "cdb91c6d01aaf0a69d2b472a87363dd20c67a7571476ff25faf0c79894d72a52"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 162000, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "36c8cf60c7f473fb302331a13b57362d36098f52a66c27cfce14694999343271"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 158416, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "b451368f5055cd02d1a9c98e402f038d6adb3728dd14e3c26b9deab9b14bfb66"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 127120, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "cb51a55abfc0578438a407deceb0252e7540e83312cdc7675a0648a208721172"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 123536, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "7b8a4d109b0294b3d6307a87aa749ee83f06792b4643e376b852c50c03392d3c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 67696, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "f994326299feeb74312dffe3ad19e065375415da700eb2964a877e01b016f6ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 67600, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "d02d4a59bf006bfc09079426d0eced46215a31f21f27aa414344453be5734203"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 129264, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "e743adcf0b0af1ee169dee4aef8de68c1a791b5e0281fec36eb4fbdf1d77bcaa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 127120, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "1c7c5e39e0370aa6ee11689a4f4f8c4470cf89d36f082a61f13ff3c3c82ba967"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 125168, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "9f181f7077e4f6bf860b6bbeaf96d636ed816736b861bf5a5897be551887d7ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 123536, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "58eef6e06fecdd7036b52b807848902c1099c01f160d38855faefe70f01665cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 162000, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "6de75b87eda747145cc3b21b0a13009908cf8349a01cf54078fd1b84104c9c64"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 158416, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "ebd7d87c91eb0dfd4767075978513c550911752055364656e20c2fd9ce1184dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 127120, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "9f9910014f89443b16bf9d0b9f725ec021a69a9350dc19852c2d5a83d8c81d87"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 123536, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "5d6fd066c7cc776055b30ebd5696315df739db7fbd2db5e5402042105d90a9c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 67696, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, "49289a3ab5afbb59713d3d0e88a16230dad89665cf3ba5eb97efb461712d8cf9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 67600, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, "df411b625080c405f9f4aed715ab04c9b162a6b028dc8286ed4ae011efef5d2b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 129264, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "4e7cae4b4db0415e6c78c2fa806336be22b01bb9b7b1707aa4ca659cc3c3bb6d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 127120, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "ff28fc5ff298b44515297a25417af976e6795bfa9cd9f36a1595e4ea40286f77"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 125168, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "39ead8301491bbe9d1f690854f67e953759bc9f77d2b6168ebec2724444ea200"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 123536, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "2fcd4421cebdb06c6ca9a6f0880b73838fa1e9582fe05cd526ce18519e085d35"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 162000, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "2897e5e7618d06504cfeeba0ddcb9308b3bf2c31d74cdef4d17ffad6ce087513"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 158416, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "bfb477a96df84a43facc0ea69872a7114fea8af61a3c6ff922e4507e6b612f8d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 127120, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "3ec4682094e18798ab3e3fafd01ce2c1c7de59ef5ee6f3d8bab6595d04c3dfb4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 123536, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "b44d6bbca692b7b319898305df0d539dbd8360456e20d12f8e0bb25c494bcbfb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 67696, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, "8740f4a04a2aa9043843ce2874f7ddbe3be0959ce6fd1923a6c0f42061eb58c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 67600, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, "f301fb6510aa8a093d51b92c1ccbdcfb7007af15b4c9e3f96067e05000913e51"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 129264, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "d7cf1fdf8ceb050e75fcff9341796c5f06eac221c2b7750a3dbe8956563f00d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 127120, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "56b253aad7806d36195c6c9d077b268d690239845a3bf7e55a6e13032ea55ca5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 125168, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "ca80622faa23006f306ea408be886a44b6aba74f21d1c3847a3ea69e1eb0d03a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 123536, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "453eebe179ad4af7a1e0af07693949388e7193170754bb3f62c55fa3723fc00b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 167184, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, "49a413561f195a319e11be2856a3c7059b82ecf27cf221bc63b802f447f3f8f6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 167088, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, "de94647ae578624583bf62c938cc8c6d78425dfe5867f120b98ce1d3ae0b8b55"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 167184, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, "64ce5b8316aeff0d38644327b1fe14370c3fb2afef2214c4f1adeeeb07839fc9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 167088, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, "5b5b4122a0801295b7206a94b00ce93ded4547c1c40775f55fb3deda83e95de2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 167184, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, "57efdfd3dee6194970a9005a753e559d59b5362c023d7153c477bd77df13e497"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 167088, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, "1e41d713af3acb7cc3abe799ae178d689f8f60d3099d8ce73c49912f4a83157b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 168080, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "7f5f73ff02d662b129dd60408be6bac7c5444ab4cfd093eabb16d72bec6559e4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 167984, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "a0d56cedb4ce3930e954f6e5f8b7acda8487d1a5e3f3cc72394b517a0ed4262d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 168080, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "815611f7e708cebdbefc0cccd202ba87909f320a645f74b3f3fb4201d82f59c9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 167984, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "0102b0c4293b2beb1315f1c33fa6db80ef426c00ba7f0c77487ecbf1ee862726"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false, "414864c2d212c4fd43a686a00e4139f590053c3e60b233bdd07f8c4f2762de45"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, "93ff4b0785a8a26b7d47ac9c7e0a75bc6c3d798ec1103405a3e96b8ea04bfd57"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 200816, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, "8b4567d648953d4df0d157681e9ed6a0ca07e38cd6b4980b88be347fd0890ae3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, "2f9075e6ef9e98cd27247498c454277a111cebb91546e4cf6c5e75a9480cd0bb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false, "5e92c93c5c183adfd8706f20c36dfc93dd3fafa4f6e5b1d0f3470f712b40cea8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, "8271a90deb112351dac6ef4e727621c4ba7641f25545489d0ad9d9a84a1e1523"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 200816, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, "a5582a116a2f5d1266f6bcaec9acec3ad9dd5d73c8015654a4bea0fb9b9456dc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, "69e7e834246a22e487d0c74d16e12455eac9d1393151bed5b91ad4ec5716c644"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "994001c22c934562e508d55330810b75cf53ec964bd9c009c79930f438ce156b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "f77da750a7f6b7737270da131296970c435954e29db80f2a82eebc9a99cd056f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "a16414d982ea4f4f7c803676e4fc24f31352bf7c580935a4c299424f759a47c7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "4d6033a76c7903f3187b291579ffcf4a0d4248b06fd8ba5c104b9fcf26f2bd32"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 168080, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "3fe09eef90bce45b8b09df1df30ca77d551a660fc1494fb9c794d5cf35f2bd27"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 167984, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "92e156d625553e22618fd5fae89f596b6efd95b83f23e033f0b3658f2546fec9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 156912, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "dc5ed4a0b834e4544675c6ac2d8fd3d63d7afca3701c166154b537ae67d0a9f1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "f8b20e884b9c81f94b24f670d1365ba4c6ea289518467708615e133fffeaabad"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 146160, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "2852c47e96401c05852e0ee8777af5456e30db4d23e4d99e4e16125926ac947c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "d560c0777d86978d4a4e77f158ec8433d6da8e0577aa68a9f40996290676e6a4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "e8212efe2ca34222762d09f042738227fab916b5f2878731921a40c115a73798"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "cd60adf1460384491f6476b287c3b8fdee34b424dfbdbde59a27402cc0fd80c0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "a99a04073765c721e0afd51d9033e0a990ec1b4f5ca816d563453e9397e21699"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "ab04ac6f2925d8da2ddb3d2ae898f942dbb230d9864bada4670c4934d8a57be9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 168080, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "2c87b9eee0b51ba8e95f4500839ca5c38eb1c3be4d677d71f56985f207f69877"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 167984, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "2404cad908c7018bb58519302f0cdc4bf2da833872ff942b0b3c6b8727aaabd6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 156912, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "986ca7687ae2163c3b99b5f7131ff06edb9eda61d185b17675d4bd66e6d5a68e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "63f9542dfa4094c158a79b2e6ee108537ded8ce60747226902562ff19bef0dc2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 146160, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "bb7090047dbb820346ee83d5fb4e99711716a0956e0c0dd69be62eb2bd5b5981"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "48fbd303aeec16bffc1e5a33ff392f78b25ebd215b800c2fe1941d3696f9863f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "a7638b2c9ee643ed7ba309cfe9e8f69c0396938cf25119656281914c68b43fdb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "bb4209356b65f397deb9e3b535d5b1e984bae70316a8d05ec0d0431cb4d78dc4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "1389abac083110d1baca163faa153532c44d97a28e7ddee1529406e4a9d47182"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "2455d1a4a1d943b2024e382e12ae6dce19417f7eb0dfd468d325711880aaff78"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 168080, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, "1e9c53065fc061e208cd46952289158d520d54a5277e2916b758e210a8b7f880"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 167984, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, "178e344e5211beb16d640f7279607b74b3d661efc84b8d840c592070f0d74478"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 156912, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "ef02a80397808d3aa66af8ce630b4fa65febe4554b4d7b0cdeb336dd0c0bdfe8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "864ca082ac49aede22f9929e2c136eb68a2c5248d2323e85894c30ea5065161e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 146160, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "20c467cc781e28030e03febd7269631e829f04b34e2ad4198dad18c208a9433d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "4c566635b82bbc544bda4e2a0655b3a17fd7855eb3e2d2ae35c2220877a9025e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "7f65b87abce03c33693cb5ab595759ae90c6978764e6cbb7c29ccf71c666a119"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "f2e36104b1c71882ea9642cdb3697f68e462e3612b07462f10e85e1c591614be"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "990233d078b1765da80e4070b469411fd63ad8121b894027ccbe12225b9d7853"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "71faa6ed67f087401b2a664cefecff38d7def63bfc588bff6cf3779be102d711"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 168080, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, "ff263bd06429428c9c46bf18940ac9bae1fec3c6e6f7021395fa5caf9bb2e249"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 167984, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, "caadb791f0f48d3671797b73113eaf850ab7e49df476bdb521e207a714be2481"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 156912, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "65d28a2eb4b448e61b9988357988ce8b14d9ede8264a31cfdb476f6ef049d1e9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "0d960408f6380a84085eb654155dddc2a114d6599b0c05fb5924e2fdec773a63"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 146160, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "9a2034f28a942c87abe755d803c26bd9058865c01e96bb545573cb9985a76a56"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "b5b9cdedbdf55ce7b483456e73537a1df195eac6dc8f87cc02e1d46320285d27"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 199904, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, "5b827d6188c5e9fd1081e3918c4a4ce5a32a39ebde04f4772ffb159947d8a39a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 199808, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, "42593d23155d5d6d611a237759e3544e11fb99cef1d7677eaa0cf3eea094822d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 199904, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, "7c59e9c9fc8cfe69ad7dbe44eda908b79402669e433a3e4a111887080676e5ef"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 199808, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, "abb9895bcbeaf8529e86bdbcf3954fadcd3979ba290d243d7f19cd5726a29f7c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 199904, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, "70fd6abebd4bd6cddc151d7c08646620a8aad13f2330230153ef1d5d81e3e3c8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 199808, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, "9e87db6f95e30f1dd8ae1c7e15952f2bfe2f5637c4d8e46ee7305e7e0f0399df"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 200800, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, "5257bd0b8091e4b1e832cfcecc6e215b914dc3008c400e58513114b77631d5c9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 200704, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, "7b94e9a401e8a1ebd25ffeb79dadb3482e3f611c73f821366622cbff4d82cc19"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 200800, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, "01850a97749d371d27c445f61764a2d0fdf028ec241c01c8c44cce130a2d0ef0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 200704, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, "6db3adb921be20ef061be2482b929cba39dc3a53f55f4c245d9d58416eb79972"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "87eb03705200668c15ea32f964846f1a4765d57f0c21d91e7383f3dcee2549ff"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "daecd024e68067da7f3622dd1f4becf0f78aea728bb795077f5cf06be727d7ac"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "65725cb7001f12ae047b318f2d60a5aae4cea8008941585e4c4b77746f4597a5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "7c25664d47a0bd466aeeb41e1afb09ba2551ed97d3e2be2d74210f6accd9926b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 200800, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, "9d5e0e07b0c7ec0ad334006f82d235480174281b86b4cdb0d3c07531cae8b058"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 200704, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, "f0226d853b50613644fca9e3af5b2c9e295253195dc7c2313552dd17ece773b2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "7c827bdf11efb7a527d7afb482a25d54cef51cf532d469267ad7046be7dd719e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "7e4bdea612b3cc92dc7b1d1c9a4e113220afc5dcc8621b60fb33458bb485568d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 150256, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "ab45b3cd0ef65c0a8ad5d59578a6c130ace5711aba62ee4c7c6e425daf2f099f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "de43a97a59cf906e0f1c0f85e96c33ef67aa4206cfa22eab3dc468f354ed22d4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "ae37a6f847b4725be089215763babd285a0b42890e650986c74d31502759737c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "fdb643c32df098634540edddd90720cf341c50f940bf7adb1407888436b8b4e0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "1896d438229b7def8bc627211acadfc7d4fa862d20afd83ee4faebd5f3370377"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "cf3573e56e86b56e7f7bb92f07c3d1cb07b13390516eef1d74200457c558ab31"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 200800, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, "7048ed2a384fbec5585ec03d01759b1b5da2d4eafb7e1d08bdd79b91f2f8a3b5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 200704, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, "b555ecc457a820cf3b773d1245cbd72dad425dbfec15917aa84616f30ea35f46"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "82dafeb9d43c4b42f2d050645f4d50e6814bc477066303a744bbdc57d439adee"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "37e2cc71f64da73a9df2037dbac9768c7f55c9bd81a80aa94618ae64d6f17b49"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 150256, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "7e3282271059451f6e8adee150e050ebcd708cbd8a96e7079d6657c2af5028f1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "4ebe505d63529a770f2975c751918cedad10ef731188b311d9b0422b087a0bb7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "a5f2d9c1713c5d72c4f530e8d3941342527dacac9083fed0e29650c862d02408"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "a9f7bae82462a9b165b6126831a36c872da619fc01d64dcad2114238d3cd1228"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "8a168f850f5d4e4f68c454d2431abf6eea9533a6cab1c49a7782e0f56080f131"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "a1662e3198b0d10b41e5362a643bba55ff542384cbae56bb5316ac772304a550"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 200800, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, "267ea4e9f5ab581a7a12083d220b5b3bf2ad31ddf62052692a9b3ad5d9da1a31"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 200704, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, "e064a56abec1d5043f67787014c2b5981bf34695eef729a9eabb7c7864c373a6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "8641ff2bc4232b9480f8edcbfab41fbb02355e3fe9fc6648835ad0ca08819ecc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "57c72a3135f0ff30957f38450e3e4adf2c1bc5cb21950e0eea10f6817523675e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 150256, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "e8cc172e1029a7d5c32c3bf509ef84d0575fa6537c783814d77b715fd3e04dbf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "27c6e1f8bd5b026083eac9ff722ff37cab4052633475c0209d45f281cd75d0db"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "8535f6be82fd74eb008eee6254babf372e2e4c6a5b85f35d4cfa5dd7b674f616"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "3d43f428c857d494867b97332d6509207ec2550e594c4cd2560bb3e465ece9c6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "5dd80c0cb7d49086bc803156b0edcf9b936e7ccee29c096ae8fcd744e70fda07"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "39f35bd0fed1e5b05b1a4ca0d775038d0cbcbb27ea876d041543410fb9610830"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 200800, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, "669f9fa45e1095a8838697cdf9a9281693ff52022b9f035dadbb1e197d1ce550"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 200704, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, "f1bf6698cd9551d1c21f51c4bbfee62f02fc27b5bdf20ba10896d65bcabf6a77"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "865237a7cca7d0b9c35e7be5f6526b012a35be6ff25fd547e92444d57aebe959"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "16b6cffc972f0c7e577eb8f074cddfd9b91517bd62a423de6fd3d81c3f1a96fe"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 150256, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "495255aaad0b9cd737c05a33bd0b07475a55949ea7e128cc69091e46edc510b6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "bc231e2e6a4684d7f5acd6a526f82d13aac4c8931488c87d16a75313a1d5f824"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, "963ba2586473a02d71dcdd42442844654d959bdb46e41fed1d63b45a9c5aaf87"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, "1b89866a97e49de3166f6190cdc1ec8b21aaece4057c331a16b032363663679d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, "70b6011baada436f24fc3f8c1153c52b99e2905a89b179ba5324d54045e5ef23"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, "4d8871791841841ae1fb4eeb50b234146f33227f053ea74b85555a03ca91dd27"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, "37ca3251c638f827d10aeb4f02549f4f53db13e9a73ff04ceb59b226eeba45b1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, "35fbe070b3b17a276cb1d3aefad9a0548f25891efa3fa9baf869413aeb94ceac"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "169d8a00ed3b6746df4fa3a4d3bf3723f854e81e1426023d0f1155e6425f9bf1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "1731f1182e05b8d3af4622343c3e785fbc8af400320ec87b29b81b64cbbc865f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "66a2749962764c84eeb315a0e010951240d613192bcccf7e56c5b7b0309cbcf6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "606d930b565cb53830bf4e9fcc92aa961869f68a57e17db063cd3f119275f106"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 202832, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false, "59882b79b11689030c1d87a1b91c9c4c6db3dd0741969f0f57444f4f611cc141"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, "15e480747d8c8f496a388d31af72ef87a258b560add38264598d871258ae5043"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 184432, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, "889b17b25739d4ebe758ec342365bcff0b018f3574e59c5560b6b68cf4277b92"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, "7486e9cb90defccc05257e76a26741d159e969a26a26c3940c8e711d24107425"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 202832, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false, "84c4e9028fcd93e8d1b941d7795751c59675096ab901b6e55e0949ff8e9a2ce1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, "166ad548a7111c474bbec0b98e2875b81a630de4fbd51e15ab1083b76114911a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 184432, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, "bec3bbb6ab551647d6e41d0452a643cf716cf9139a5ecd72a6467511973bfe1a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, "ad9ab0d9f5dc6ccab2c44965d404740c84b1ff8765870dd39bfb7db9e20532b3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197840, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "0d677603d93d7fc959c90b543ada287d8c2295296f9a8c30e1be514a814d0f9e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192208, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "f3bafcb064041befa824f1213994559d6d6a62ec29a1b450d84f4ee70fa65cb0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "5bae8e8af6ad4539c30f1e527f6809c2e537185a57c55ada98db0d6a2241d728"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "9418553445e16b1644e2d6c8a132d5733bce2b22c52fe1ed26f9858ba531dcd8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "17a5eca8617e31b795eaebc78ca9f1784ec8c5247e54f70e414b4d41f6414b75"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "a8315b876dae4a1e9a7e5fc25bedec5f805948f7823c2cb39745bfd64ac36225"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 167152, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "b43a33b093c2641ccfe23ca949fabd6b3a16b0990538f8f139f21c202312dc77"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "9b1f679f8c51e1e43a5f3eeb9bfed94babfb92ba7b0836105577b593b52ccd16"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 159472, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "6e260eca98cd4f85f9a28a3fefc515150deb5e9003a44769d799dcce2d4c3bc4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "e4aaaa7a85298f72f1d29abb6bff6418c92071fdd77966d705ef71be338dd853"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197840, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "104c09f214d633b2763380e96b267b12fc9d826676e4813af76bbe0592bbdad5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192208, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "cba963584baa243dce364353129fc592d40c422dfdb622e1dfff50996df8f5f5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "43d1425c0c0bc423785b7025f02fdf9c05cea3cbe65a7787c4f6f7d95f78600b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "483be0819c4b70fc7fc60fa34368f5efeacc19a7ff1cad273500a25c7e52587a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "f07fd28f13813e39f7d500521bd17506a40204e66b9f6cc10cd85266dc4226a0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "2bae5df62558af5c11e9048ee257edd31f4ce17627379d594d66ce7a4fdf15f0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 167152, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "e95af640c144f550497c029b7556b3aedbb9a8d88fd85a91f747c464ba1cd21d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "d69823a4d16143b7a2d64b8d2a21b29e90bbb95b6eeef0f03aa2a7c8985f5508"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 159472, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "5b3c6c5dd4d43a68cdf22bfdfc1b64008dd144ffbb97d9be8361554e36608075"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "caa13c7bf283c83aef49950380bf614ce5f2567db768b49d3b3ea38d2b53b8ce"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197840, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "a2d5fc0445820d77c0047d9739d0ef17870ff4bcf4bb960004dacb82145f2f5b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192208, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "a647a052b746d83918a73cf856cb03322ee95dd55ecab4587b4c64ee714b09f5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "66309b7c5ae11b0b9b4bf7fc2587215af6cbf2a8bcd53d8d977a2d3b5e149a0f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "759e1555a26e4caf4bc35f7be60aba75df566c71d1b2c2257f393955925e7d8a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, "c69df199c6e5dd69b3ccaccda07cb5263d0d013536312729d60586b59e26de4f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, "964373f4b8ea5db53cecf0e04559747cdafaad6e9f24f298ed197673b8de033d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 167152, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "222bb264564938ae2184b6238c03af58302b0d77f7cbc27eb3f204c8a3eac8f1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "cce7cd65fc53176071420496cfb15fa58e527724a4475eeb552f15b670c64550"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 159472, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "82873a554dfa08754bcf2b9ac9a1963e5bd62934a29e42c84eecb3d818b36fad"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "4704b51b2605958800c5dccc5dcf56c1b2045d0ee7ef9e95d6bc01518f4cdb82"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197840, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "1ba5f7d495fd1c01ff49d121c24224eb8cdd892cc9d12469a300244f68a24114"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192208, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "16e860b7f9a1892f6a3fa47d8e2b27207b5c1d2f38ad912d057fb9edb610f388"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "4772bfe8a57638d311246c5ebe265bb98a13c6dec87a659873bb530ef3dc487a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "1d506c7c65e28a8282a6a62e168c354ddb71ce3052c4b15a821c7aaa21e65f5b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, "e06f4465bd8a0eeff7884d1e0a7a8381d8b7f635e2f7d133af19a7b0dfa7ba8c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, "c6504f2fd7b5811e90f00c3fd913ad9b8e92ce66bde66927d2899ed6ecba254f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 167152, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "f28b77f240256edb6d3925379533914b2c609532124a29a92df2a432bbb48292"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "a15d1c550b247de5db205878417c62a0a88dcc969b5cef137210179de4c9100b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 159472, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "5415e35326e532f8a19405d005a9bbac7f280a212fbeec1fa1362b0379621fc8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "e1d40154d3cb409d64f7785d0c5e55b99d8668e29be3a5786de262d0f873e4aa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 199952, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, "008c0292d1e8be9457d280cdb1acb3bf70c2d627df9cf169fe124d86e987dfb6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 199856, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, "bc0464196eecde0d45de6af07d8e6968d804e2a8ff75b96b467b84bb36ed1382"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 199952, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, "55fbde32e19f4db8e2b30b7f36b8069afeee8efe18521c5978d8543115cdd49a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 199856, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, "e0b6386df07a47ea94812fc07b096f364b81de1c8412f7f8c6545c99db426576"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 200848, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "1e72412b35ba07199c0bc84fd80fc278659c3bf9931bb4567c879296c6d67111"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 200752, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "c0e377e8ff93390aced689c64247eaeaecb3a60913703cf1d522dec1e96c31ce"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 200848, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "dd43b03dd3f2d12f8665cf4231cc6f63df91aec322686948d09b4f1fcc732500"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 200752, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "d0e0a315ebb083f39b6bb545564e8d67575af37c822a0d6cf6ad2c0f3cf735a6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 200848, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "4c0df012eede2889fffcf768ff1e1cf5b2541b647e307f8428ec70477be2a8ba"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 200752, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "22349e85c717630293909e488f64260eb4ed7b746100316233d84735053ee823"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 200848, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "3b4baf1c1d7157b3ee3455599a8154c414d050e3b7615ef5050e944f64af172a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 200752, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "a3c6858de4f3f1c8b0399a8a2da3819a4eb08eebcfbb2e260532f02d1bdb55ad"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 199952, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, "8d6e991d6d51bf38885edb0a2fd2579ae38b963194022e6812c49c3fd206bcd9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 199856, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, "f36344195f8d48b67b8673df26526e0630fb32130ad70c0310cbca257c7eb1a7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 199952, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, "55800ce580f76751dd403483dc1d576c64a54f065bf9c2f68dfbf9a31fe0c691"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 199856, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, "d6c608e70750de7767165b2eabba31c34f579720f99aaf8b458508f8b123c7d9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196816, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "e33618656a6165f54c5adfe74c48886a7e1794d264cf98f2c85cfab406672cfc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 209104, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "058f8f982a229839d801a81bfc371c91a5a3ba198302d149c0d2532807710ccc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 208936, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, "bcb3681741e25091282ff06651f3a6ca34266fec670fc7ccb85c96be498c018a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 182992, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "08fcd80644cb1bce6fb6246179ed27d6055c264606ab968e04669e2308b25641"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 197328, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "439bc3ecd47f45f13b310bb0d79f8dd517087de0c544835da0a025a13c9a3bc4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "60a00ba6fe751d81aa30345f08045fd84e6ff40b8e2c7b3584bbc28764b24c44"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 175248, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "232e26569954f2b79483b8e8d4322134175e7353cc1eba3eb722b78d688a190b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 208912, 384, 2, 32, 0, 3, 64, 0, 1, true, false, false, "b9242b2bd12905742f5232a15e9a640a79880292ba99d59e27d2aeebdb92410e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "726294f4109e3839249d5e19fc701052fa0d5ccd142ff4d5d123f153be600463"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 163472, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "0ff4d638231f91855995f250401f8c878d3b29aa9d5dea14b4e3aad3337341b5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 185584, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "e4d7d9229e8c8165df6e4c5c53ddf8f9a1b9e1655af5605baf869451e889f936"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "7c4c0e0fea700a42c510cc349385c00f3c07e0835b0efa687c86a7ff2ed4d1e0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 197872, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "3431b2296ace6226f02382521e9bf887704b082329006a8a168422e4265a8005"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 175248, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "1444c28595af0fef1a55f658090617374d6df04043dfd8279b04410e5caea20d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 208992, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, "12c73b2586ad1a52e5eeb89ffc21c7ced7756fe545ce143a96f3653cbc18057d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 208896, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, "a7e0e8205896f1d856c4ae0c47f298586e7aa4589ae7374439e7d291222f2ce8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "c126571eb03ddb6d6fbac8229d874d579470a7df105aef301959c4e202d77b2c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "f7612db920e45290f1c13cd2f600726b14bdc506eed15737f72c4d021d110d7a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 174832, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "1017d2c81a20ecfc48c01f81ad99e08fe3c9981312e9b689753cf70bbdb82115"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 163472, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "2bb3fcfc7b0dc5f770c59d789fc9fc54656db69b19bc297401c2051df8a0ece1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196816, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "5b0c0c9fbe49510fdf33263941b511add53602cf1e3aa78f286f24671cc3f87d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 209104, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "22d5181dfc06bdea9c4441ad5f19c6289d83d29763c38f0c5cbea0dee5b13d18"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 208936, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, "3783839a9b2a4faafa5e10bb7853df8cf0cfd1e58226b3e39fb596ca6a691c89"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 182992, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "e0d8d5e89e1bb32cc884819a8e91ef8f24a7f6d301973cafcae38100095c52a6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 197328, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "cac0019be46574decc23881932573affd2db9b49988d132628487f5bc3607578"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "1780e70f26c3e83e0742bd67deb6d48dcc9424a8a3ff7f3025196e9ed0de6268"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 175248, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "f933727e446bda1bc5ba8d38e020931f2ea2616b47e7f43cd31bb91294df2666"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 208912, 384, 2, 64, 0, 3, 64, 0, 1, true, false, false, "72582abff51ab56d1642f654f14fc37e14333a331191609e54d6a90d0c33dd1b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "94c13badce7d5cef6dd5a0dd5f2c2606e2b3969ad85e06a09ad1d0d3535c4141"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 163472, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "512bacf9534eb6eaac1b4024c02a6c4d88bfa8d8e97d4692d00d95e47e18a774"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 185584, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "2b45fb3a267bf8fa42838fba45ee9da010c1a6a05b72c2580b38cdb768564072"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "f4598bb968c7616eb9bcd4691efae26a00fd932c4beb2a3a09220d8429dd70f9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 197872, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "2fd4ad1de2ac6314dbd73de6e1a8efba0484362a80a4c4dace00f42757a3602e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 175248, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "13b626297145f211425ee78e50a334b08eed003e4bfa6c16276b40915c5c7555"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 208992, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, "10adef7e68e689b390baec8ba0b234148e54806f6e884ccafb7897666d23226e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 208896, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, "af47137af9d169994343f9fef3e7a79b78da306614b7e9264b3c82031a5327e1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "3833a85774adad816e6168fd1940ceb5ce60681ec24919e4bda94ca189e5be1b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "3a325a4e71a26e8fc155b54f3cf8ada5b2782a50f0900b1acc7ad64d1220281a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 174832, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "b2790d97fec3d2fb3c78c8b7b1cb86556e605ef2e588bf33e0ab0b2c4f1b7c42"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 163472, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "c1dd76c49cc098b81dea345c5540ff0cffaa8e67e28f8d64c6f5c632c93de47d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196304, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "c915c452e85bd6796c5eb8f75fcb2d50870ebc434a4bae291ffb3a795188904e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208592, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "5ec7310c68b3a73190583515253bb2c8be21cb0cbe35483a9d09d8fe49db4e9e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 226472, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, "4e8e8de80313370a0de8da8d891cefedb8304f995a917072759897b9baf8181e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 208936, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, "4e77a42a9737be7a69e9903fbe5d79c272e9b75628e65698bbf522bbb47ee6eb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 182480, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "65c7f7e12266313dd1eb316ea685b8026dddc7c1b65d0059d8666f83fdb6ec7d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 196816, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "3324f7a3b84455e8ee8efa79f2bc03ca52b81c0be904a3aecf3968d80c8ed744"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "5332ca1f720d9d3b61844617fdb8334f05b3e529bd28db74b44d827dd0c8ce74"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 175248, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "89458c4147ab8208e1e3db4e4546512a65212ba30528b63beb6de08f2a03467c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen", 226448, 384, 2, 32, 0, 3, 64, 0, 1, true, false, true, "45550a6051de03fd9b87537cba6df64b32143cbf17d476e149342a84188957dc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 208912, 384, 2, 32, 0, 3, 64, 0, 1, true, false, false, "b59061f5ef5b9333fefe4fa4085b9eaa2037bd6d3406cc73e4f7017b36847f73"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "0d048aeec2684fad668c8110072a6b09f80caa3e9183dddd13f6acfd101b2190"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 163472, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "56195fda7ab7175f4df799bb6f4cd09bdfdbb54da2f06eb2dc350b8c69f32af3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 185584, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "d9cf20ed63af65ab1fdc262a211550550e554ecddcdcce5d55a26f01cf003811"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "d830bf66315e4bf9ef3e86c5eb3289c3fe3362e9e5c3073cc95dae0d48056d53"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 197872, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "eb742e10f718613b95889f9d6d62aea05293fc000825eadccaa0ecf95cb3d19e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 175248, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "6110e2d5f2668c40cac4cabd422c973bde5123261f7772d93ed3c7736fb723e7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 226544, 384, 2, 32, 0, 3, 64, 1, 0, true, false, true, "bf25a6f2f3c22de5abe46980f274193564ac79709508e7a6ff57218f7d94cf98"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 208992, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, "c73653a959c2fc0979de8c11bbaf1ab6f937957602e96a84c393b1e68db9e6f0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 226448, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, "ebba03f2425f166a928588b86a92024aba3681a1e308f006e32fcb1ce64ae6f6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 208896, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, "7a3cefb545b753325b7e3431a06fd216af82560177df8e473377720caf7c2ce6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "6d208dc5f089ea6782119462529626dac5450b71d39138607d382ce78313aed3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "7a9f2f8c736dac03cea5b08bc48fa924e988344676fcb9bf3cd1a1e9459eaf01"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 174832, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "9172500202c7dc957037aac78729b497323624b1b087fa5dd19285e480c1a8d8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 163472, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "fdf87830a075fd137ccbaa5c24f707794c7159cc503b0cac2c371263ae68db25"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196304, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "39d093e23f2fe5a1ce852a800b4cafc8243f4ae3e8afbe54a42e1ba1e006f519"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208592, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "0a61d0d65e3f01ff1721eb7938cd2efc72a6d278008ddcb46ff19965686d52dc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 226472, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, "09cdaef606af16e56cb142fcedf58641369bcb57a5b2e70b5bcd5d52c6e38d2a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 208936, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, "9a32153fc325c6867ad5bf14ec3e7a1919d0db025d13e850bbb0057d4a1dc091"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 182480, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "46a03fff0da2f9485e8854ae1f30009ff2073cc249a15500870547ffed470957"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 196816, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "45f6d54be1cfa4bd8b30e24a90ff151af546afb071f32b305ebf9025aeda447b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "2d3a2b07d01d54724b4b401ef649f830aff2bb11973d14c47b2553ee749040b0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 175248, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "8cc398c13bfe4148ace1e57159369e5ab8066c69984a0533996134e1d0254ce0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen", 226448, 384, 2, 64, 0, 3, 64, 0, 1, true, false, true, "2f6341989e52873df9a876b6c44f85d96294e221f656491cd18c84ba0142c498"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 208912, 384, 2, 64, 0, 3, 64, 0, 1, true, false, false, "718e235e707dfca1c9f15b22f0c8e646ed5a5547469a61c99f07676507af0499"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "3016ea6c352891fbf13cb51aba600a1cd35983fd4d494cf0e8e8c7aa7e7ce228"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 163472, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "bab0cd48fa4bca85229b557674ab05c277e085f843641f8163ff02d0fce10914"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 185584, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "ef56234f3eb745bf0fe9bc9f400ac18916227a92f8b18dcc3668d74e76c3d9a5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "c570b98483f5eba8e94c9d8fd75ed1d6b93172e3ca75975fbc6192fa52a7f1f5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 197872, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "3ff87322eb4c77d2f2f4d2dabd26648a432e565a7973be7864a53b66ef6ca2dd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 175248, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "14b02f6b59b5bca90d4fc7da158a28e2e825d7799d0059fea55eae1617bb9d50"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 226544, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, "db03574f7f96516096972ae3ed16ef32529cd75d079bddaf9ebff920335c6c4a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 208992, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, "920921e16b317f978e2f48816a696b95e08ed951d0755d3173757c0a61ff869d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 226448, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, "634614425635ddf25bee2cd0534c5f7cb9b98b19a1d11188f37e953dcc36afb9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 208896, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, "dcd223933cf0a04faa51cb457eac1b509e0ad386e8e424d30086511ce748787b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "aa4669c56ccfbe2e23c7e3a58761e892b038676323ee9057b7320b971ae5150b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "2c0792cdde5db7828ed94f5226b0b023b26749d6af94a9608a84e079c677692e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 174832, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "411f32baa037b899b4d806bc0b5a371b796007e2ef842a51ec0ec8d2e3196c25"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 163472, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "343eecc8ea7217db4a6d821f9f44676aac9019dad4de6da6b87725777c3849b8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196048, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "e42a2d0769f401de5634d5abbf8fa18872b1d22da15f3422b32bd39105d73fd5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208336, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "58356a6fece13abd26f9fea7ad141c0787dd93c08c67538c0e1f770f333ba072"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 208936, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, "ed2d21cc4a6d68b044a3775ac7f7e22bd9836e7ec0f75aaf5a5db9f913dcf2a4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 182224, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "f1fb3763b19a647c6cc00818173ef7a4dffd97d1d3a5051b6d8bc3b0707ad739"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 196560, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "2f243110b80394d250989cdd224b4f33963c5c93d9f20bcf81efe888a22ff6f6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "5820ebf9367a016b73cb7b2d3065bdca1506d078db8052700b10fecbda1d7db1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 175248, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "0d0b75b4329a06acc969656f9a594f45c145a41a88817ba60a213c5f716f40b1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 208912, 384, 2, 32, 0, 3, 64, 0, 1, true, false, false, "1af50e2ae3483ab2ae7b992c5a2ce03bdd69a3b0202e30478a3bb1a7f24d1893"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "f5073b650fee4662b4e60098ee7a423796d4ad96b1d171ff31e57e53a92c2608"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 163472, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "8eef28b7a5926e7296eeaebbd61564ae5fbad86f16a4b797abb6027e20e57cea"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 185584, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "9449f1381c103abecac3b760189989241796bded2b0a207b00c796729ce34a4c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "8fbc68d8a37e38634e9a3bbe54659d807c3ea8f60dbe0ef15daaf7e1e521969e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 197872, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "07292c51d75183fe862393f4852e7f112c4e84e7576225e397637e79fa6f2ef7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 175248, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "3714a028f5e68ec1bd4d001e33a71b26fafd437b06345291e526b733e10c94c5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 208992, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, "d32fccf76946a1250fa5a7c610da32133b3b64d84dc59375fe4f3513f91d6d68"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 208896, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, "7ac902c84212574a787e339baacd76032a4721bac49f857dfb1f9c0dc9abbb01"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "26fffa7b5011f78bf735c30857244f6102f2124189e91f765da4282861293153"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "7fe0607b28fd7afcb765d90a337345011cf6a90fb9a38829375d949b3e63bb74"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 174832, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "f8a3521bc491b46c8a29726ea3d1d07c02a2e4745ed8dfdb9509801b66bad2b9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 163472, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "b0cf9972735143d894ba392ce16be8b21a0c448500b694af8037bb1fc0f33430"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196048, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "4e2901b2c3d4c71558825fedc0127984fc121da9233cf74716e25d4485907cd0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208336, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "730d990a3df89fc03742a529c0551ec4d620804d9f3e397559322ef9f9740d0b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 208936, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, "cc17a53daee6fe3c6e0339839b4ed7b8ab23d00be9be18aadce238f8e0bd165e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 182224, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "afd5818d92e313edaa4aada0ff16745e02f08a2f8d778c568ecfb2806694e8b2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 196560, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "fc391ed71667fcafb1534adee9fa6d629d6620f5ff43df67e042c6ae498b7a84"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "125dd2139dbe33edb36890d529f6ce287f2bbc0e637d213deb290fd5ea61d749"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 175248, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "50ddda60af042bc44268b7d6eda4633be8a97a0373720572ee6d91cf7f76a7e0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 208912, 384, 2, 64, 0, 3, 64, 0, 1, true, false, false, "20607afffa71ab4a13eea60a0bbe633cf82158be4aa6bbebdf9537d7d6c78f72"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "b0ba738476664ed23be73702371b3a80fc9444f5c72867d4eefb7262de49f2ab"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 163472, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "5511ce8ef6f9451c861ef5b9ab2cbd359e681a93ad39fe273188f546e595ab27"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 185584, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "bbb490d46537bc03ebaa73033d85e9d4a52864ccc927aac00b041c11f393f596"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "3e4ee294bf29b86299bc2074ac9bf7d50df598255c884db61f1192db8b29f358"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 197872, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "cd041d553e3703032e3528db44f9f47cf630a63d0edd4cab71119925280ad657"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 175248, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "95fd5c6ec34576880e1e58b3abfbf2f94b90a24a4389291956c889b7f6744a40"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 208992, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, "b78105a8835ec25256365ac25876e249a0f2a8ce8101dfbfe8059b2e9961af3a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 208896, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, "58f180779b07594c82837a81233612c8c05d1d20e8ae6e6d0ce0224e5f59bbf5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "c22566b9d769ad19154dc7d0c16e6c9c13da810d4240401cbaef2a17387a09e7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 149136, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "4bdf24a9cf7fa0d64cfd98734c8b809ba9ed645fef8b4634347a04972c816455"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 174832, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "718b6bd1fe5d8f67731324f4a399267cfea094206149ebac2748f1f48c09adb3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 163472, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "4d4ca03cd0a1f850b60399733ff2b76fe45269a6e2466314c560cca1a195cc82"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, "28b866092cb49adae74232bbc1aa6a3b713b8e1a279d83e7f44253d36b8e63a1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, "ce6cd148713e6f140c647d95c352a64324f29c8b8f1672a1c73612633034e7af"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, "82271891a7d2a7cebafe06b7aa0d5e7e54641877d7cae2fccc2c5e104f5d9a9a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, "4b6003595350341b30884114c358c0b9cd1e18a9bb45ccc5025f64f02dc9fc70"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, "78a363f3a0f8abd30be74ec1821162723447a6c9ffd73f841f19de98cccd5ec9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, "8087384404d5e31900b3f5957273e7a4f9bd8351c084182351a5222b743b16e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "4359222e3a3d0e1ac56fd318ca831c331fa4f6d1c817d517ebb5fe7f6ed067e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "2721d05c729ba112cb8642f967709f337830950bf8207f63afdf388d23f711d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "ba58e04464580a08c32312fecca9b1ff1fbe77a6211e9c9109e6521ec9dbdc16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "f43e8dc20017595194469bb76031dd313fde7b515acd90c81f6827172c4be4ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false, "45654e9f59313e947fe6180e5903a82b856b943e53cd342528f48e861db06db4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, "207cc9d4860767be6942eaae65aaafe38a3733f9ca03900d7326984bbf114725"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 184432, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, "6d1a51a019f491113295d8b1145ac02ddcdd15083181be52519e689e4205e3a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, "81044aa760ec70aeb6b60e7118874701e0e349a4ca12652a8a9a1de3bf10b2a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false, "d46c11355b6b1771ee29c8734a9a233500e14b45ba0c1beb8c1c52d1e465fb21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, "fb60ebc127bf1c092491a70c73e361fa76f7a5d6c9b9bcc381033859cf3992d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 184432, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, "fc26e852087ef02ec4afc62062061f66902176a4752781893192643c38a178bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, "40444bb11b219320c459c5da0cc9980627e4530e1bdb5aa834f0cee87cfd1e5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "b3bc4d58b8ff8f5fb40c38c70761680f9ea3c299568f59492e16457cffc46c47"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "3b96cf93245b3fc556a69aa774b8d145b460ca98fc2d35aba931923bc6254d76"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "e91ad0af2cd17d260c572100a185e4c79d19437395e7dfdfd501832d428b761b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "418cc378ddfa12237fe2e9b59aff04959f00e397a66b6e0cbe1224e7620da37c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "7d6ebe37ef71f06b3583ebc4b54014951300db65050372d731210340e44780ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "646d3f0bfc3b1ed49ac8686a35e95fb261a391a1ef74afcbc588b548117e1a4b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "fb79bba8cda5c3a09f95bb670d5220deb8d251ac0e2d566f2be86f2df3bfa144"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "42fce8fba5ae976961e5872d519eeb10d962c48e120e8b5e59257a139189f48f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 158448, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "284cc1d6932a809ef5d4412969b71ef7b7bda67f0330ef2a4e6ce256252655e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "f2e9f924745b9d8476bae0e91f996a2f8ea8406c4bf2f79899d98449a6573fec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "3b6657d1f5081dc64cc450b584e5941a4b5c9ad3f85e8b2f8f84494344a72d1d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "6e2645c8c0542ca7fb618519a48982b3e26fd69cdbcc4a645c8f95a848755ad9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "31a13aff428b23b1c9d4c3eadfdcaa2ea2ed67052ecfae4ec87453e5847aa177"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "0a01bc58735d89cb3c3fce8fbd104faeed7d1d80d39131a8dfa7b4d871190e06"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "debd96754dbf9cbc4281e4a689bbee04cfa159096e6c7b70a1307639bfdfb982"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "12d7b393072352838dfbb2bd23b878576b07be8b6d2f26e6adf346dc9d743d89"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "9c3c75ee9b7ec2a573fa7786868ba584bbe802794dc6b6367c93f7f18fc462a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "d70a48fab2c0c7183705f0594c08e3b21d71757295e8f2288bb39eb7c8cb09e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 158448, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "a8f4d854d84aa7da0d174e77ae4c47e9e4f695864e05cf5c3ebb19c76f83894a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "afc285f22f95815827e8fc7a6c2f30acf797ae446ac25457232c6831f8c7c6e4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "e3d0caded3fbcfd3060d9236c5bfb6c814b298ea57d558264c1e381f1cf3bcb2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "e4d614aba8a21bdff143c8ed7076c6350ee45893ef43ac9a91509cea0a02c6b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "767504669a0763d23a5b3ca15101eac6c5afc4ee01f60bc5ab12d42e9b20a459"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "47379a1e4ec2d75679f84a922006aa51c6ecb6320ba636a17f799175986be66e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, "92342d5bd68549c224f24f42ed41b86417cc5f8a2214ab83e30781d7a921f152"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, "59cdaf93ee32a5c96623b0e461d4fab211ce9b635ffa6c9bb989b9bbb3b58338"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "c79a433435cb023fd6026df214e2d8bdb79d2fe7b2b12503f46e47594cf8dcce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "8404197c2768433bbb85f901e6281a80b5e586bc39cda34b2de1542c93091ddb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 158448, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "c88c4d161b6eac46ec47030bf3fdfcdcfc34b04c8bfb4309aa243e6d971c3c7c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "5501bf5277488e32a85402e9eabb3e3c1d893aff6030c60a954c1b1b3c4cedff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "e94343c68e58254048775c5928a164355d1387405bf9b9e1b657c5ba0f21de6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "c080844286a33c97074d5c8ed5d859b2191dadbb6b854f114158001a022c6493"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "30dd4b60de6c4b39eca83148da948b80c35bd23e782f75294fefb11a25df0407"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "61ece72202ab82791b1444305bd98821e4faf8992fbae8ebdfb95afabdf7f3ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, "8f2d7f300dd433afc543859ea77c7fe4919f18ea7fab33e318cadb007641c141"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, "176ce3a5c3de115d5c55de12eccb3242ee2200709cad784777ec8820fcedc934"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "3bb70edc79917b08c4e54fe2f601d277730ab419d0069d6b7226e9d468307f8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "5fb4b4ba1d1a1e4b89c3afb39d30288f462b7cab9cbf7f7800d5ab0644930eb6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 158448, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "a26a7a8bff27c65d3a69d952312d8d64c9b16df237849009382775c6c39dddd3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "650f4cc311c4c0b8c1cc3a6ca4a05a38667fd53dd8970773708cfa6d3c10d9f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 216288, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, "3f0516d56b3eae37df2d9d7cf26e138b32a5f4f52acce0f02b42dbff15fea5dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, "0613df913d21d779bc2a7715abd21d7a17d49df92271ac72e189231de7063d8a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 216288, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, "a88a8eb2b49f03b5b9d7e58f1521f95199b2d1eb79be26d8df78caa8a9125920"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, "77ff4c94c089abd2cdd153f4013678633eadb7e678b767165501126504127b2a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 216288, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, "8b46dfb664168af544e737a44ec6ac22082b77b37e97b3e9f4fa59ab7507c88c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, "a5ec66b1c817de76e762a489ba99648872bb6ccf8964205e728f595b8b201be5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, "bd330376534f615667afe2ce929bbd37b562624e1b6d07f09fe75f43e16a4369"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, "d6aa998b940606ac17e3a408d32b8d17820c41fb46503a3f6a78be75296b6492"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, "6a0235f54283907de5cc874080e55141e4b0db661621c76bc39e39daf03a2f97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, "6d975885d649ca739b2b61ce3266a33eab764731730edaad7fbd3e3c6be36429"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "a546e6c2ebda6d43c1d30e9e3b37f0b79d2387d81237110dd432d31feffc3680"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "d9f32a0f1dd951ac533f6c8cdfaf50cde236d921f6f7be6ef10a97fb89774d4f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "92aaa49c40434f35511c7ebe82219b82e862737e8b41a1988c9146832be2db42"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "df50e5e5f54a6944b50b88adc579edf91010ce340321defaf088b606b74afa36"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, "8e15218ea34e583a9f63df076237baa0c6d6d118de0d6bb2eac2aef119f330a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, "674baf112d109a907674356f288184d3331e33cebaaefc63e865b176a1cfa1a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "d4455365ee840f49cd52e3fdec67313eba8294cb21f5ed55fd88090b0684f8a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "16b20d47045f3a0fac77dbb92b0ea55c4f1c6d0f7a28642565ccadb37fe795b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "26517f8331784c4a34633313378375aec0a3760edc4ad68b9406be564decf0e4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "a5944b119ae941eaa3dc665333143635eb92e58be8915af5d8c0c81d5e2858f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "29f3a117611c9b231ce17f3aef524524bf9ed420b77a539fa906eff658ad39f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "1f7df49daf644ba3bcf11090678dcade1d65fd0d65e5e4933057fa36d1e4f905"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "d515415703713a5aa59cbdb4d89901fa695ca0cfe31454b767418ffefaa8bfce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "eff4b274a33fcd2c81d5eed53c6a83541775ac0e51123ccdd5f8ab198db614f3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, "97382d10d633499b261250f93613f2671bd5b6297ab16a461ed6654964afa634"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, "54f88e80b810152e629217df192c6ef0c970c88b4659ea14e7431bb5156ddd90"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "8eac27a9e78c7f925909e2a3007d7766a6f0c151e8ca3f4bec7374098dff82b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "50a4bc382ad20d384b4cc247c4d39e1dea035fa253c9ad386d1a892a1857d64e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "5eeb9adccf0e2ad09d9b5ce07a5de663247d267c037bacf1745f50ddadc75f55"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "1092a6ff126c35d2ab2944a7e87d51e374a6969f087a75aa81da7a77b3ce2210"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "407312d2a72cd55c3a4b1c141190fb18225c122d52a1146c803efdef440ffa8b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "a2e0ecf871a5f103bd7c75bc9c272e81c1b379de7e962069380852dc165a13b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "c57202940ed7719b82dac4f79dc4ac887cb3b8039987511fac143af05a2008f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "ff1562535f7b62821e13e05102bb3dce0b1475fd22fcc7b7637a5fbb36e25bc4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, "30fd05ce189b8875afa0dfd7e8719094c8ca91b5aa921a860b602e731a014479"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, "8058d7ec2317a9dae27c1968347b10a2a4f0ce4a19f7d6db677b5a0e2f272dbf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "d525c1d7218704f9b645d55396e426b3ccc9b082452913be7df1e6f46eda179b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "21e6cfb3b6a44b711b626ba47e9eac265f0efb1556894d47dc4baa0b7496a244"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "3a2680e576da307f7d4735d4e0d82b6cc070c2272cba70114558c5959a84991d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "f740fff467ee45a1e6caafeaa95cf67a481bb08445bbf6cb1024f16c615b6f7b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "6279c5df0a2873e1d4482baa81bf25b6e9a3fec9d9070099c2aa5e77c4aec098"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "8dec3cf5d0b6dd067dde0800f312dcd73766192a05d381d035c87f109c4ae8ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "60c6816df5a73e01a00c8b5dface7841fabddc7ea4cfceacd4a69c23dd055d43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "b21fc53e42d4212cf2eb73905db5ce6b3e4a01ba6ae6b9f203840af7f20f1278"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, "cc3957d36ce49b8f1ae3174d4ec9ed3086735042c6aa5f1a5e1bf63fe2432f13"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, "a9378035636626a34c8033841bfec6cfa5f912fa29df36cc3c7e5ffa45b86e9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "c8d9a7b1495da43d767570354a3d5e0012ea20426cb6c7bbcf5f27beb0c4c3ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "50f33485840c51b9ba9a311497e30d57c001a08d5badbe0f0a52c3e6ec4964a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "27049a12e4931f5fdf6fa44c57bf5031cd4e8d57b7a13e6628c0974eee580c6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "7571c0060179e188178ba3dba4c8fc076ca8154a452c0f88900df3e46d5999a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 44272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, "68546f0d3b5a9edb27a5c2a1c210151213688724450416e204a9468dec73e2f3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, "3acc03dd461b9c2ea5c6efc1d2517a5c568ccd254ec797de98503024b6b5c221"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 44272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, "33b1271cd7648ad64247d428b64fcc139bd85b6f7b3a9330880599f12dca75d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, "c8124b7d385a1455a6c3465acb5f658e760bfae9a36f9d4b791be68a374f92d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 44272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, "73d671ac081f4b96a0d15814c3c65e27789c9ce80c07a2f7aafbb0adcaa15626"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, "5173526c2a40340a29f8d8506f06c700cc4ea9f6cac589ee9c6b21eb24a57ca1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "c02b19206b9432b70920e7c9a5b39261cd980f5be8590dbd51793243e287e6d7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "4d5a906c201f58740ec6731b8ae6b17d9bc92d2b85315d8a65f97dc3b580fcec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "209905b0384c3f8135ecf713796b415c2ce76dc64a3492deffc4b0b7238745c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "4703053ba0d4b46728d67f8ca17036035208bd1c5876bb98ecd85a0434f6f4c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 194640, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false, "90c6706bc6d8eea35cc66ec4bff12eb62e55afbe7cc808b9fb46762a7b867fa6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 159776, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, "d76265b1560a2e371c207bd18f356a61861d851e8f7893e14b86f8f8a7aa9240"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 168048, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, "d4d375392a760814bb2de899f21505e2acc027dd09c8b6d78060bd507889e279"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 159760, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, "7c4f24339a74e1135a76af37c8b12fe43ca203095176587ea8340c5fe210ebd1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 194640, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false, "05f6bd0f1fcfb96843039f1d9f362bcc9de2655e6b669c3b323acbc974acac36"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 159776, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, "8c76f6d724c58fe200120893e3db1136886de89875a646fabf9c7f326a5d828b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 168048, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, "357c5a8c40c595cb44641a6321dfa266badb15d59f90e8d869f260dc27ff5e7b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 159760, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, "6acaf2e74da50ca065c83f4c7b85920e2a9bdcb7da520c41dcff9b87f90c3405"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "ef8391d0d297e34bde53e9ad269d6f745b34391484f758741168945bc9ac624b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "9e78c0eb42aceb5be9f78071942d155b01f8cbec3fd43504b98f22e8980fed40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "4b34ea8b837f9ec39cb7fc2634c018eed715f5227ad3062fedfc33fd40694eb9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "61148b7b6f7cdf856a94af6bd60b654d871b6a920a007c854f8f4dedae033f42"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "98512a566575c49866f22136513d6db52895048b40f418be68a518440ef727ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "788be2b334163e08191118f9493eb024f47d44095c2015a737172a2d2e8f67c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161008, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "41000beb40a41571db2e0d2327a85c9e550cfbe14c2f92dadb46c3cae29a9b59"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "a0abffe57d4409d0a3046ebf9cc673ee0f9097813617f40271b6ef6db47ea24c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "d063416150200bddf5d1acd50132247f5e900b71c07a6aead1fcf0e2942287c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "35b7cfd73f2aae14d3a3fff441b0134851279a0d63314467bc9ed62c7025e3e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "13b0516174c4d778ec9272270ef89f737fb405839f0ad1a7ff42379324e0d4c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "8546ee0035eaba99cc4a0692cf6d64050d52860b9d49b464a1cef0ef3ee6fe8d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "f9ebeb1fd629a887fca938ca82b106fb20636414103642eabee8290f235be635"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "0264c192fc255dc967a027a01d7f80c1852792967f2473d40a814f4c7b0aa7e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "21aa85cdccdc7e8c188dfb0370ea3a95e48368b571c37cf45b2bfc789cd14cfb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "115a7fcac4f2e386d557525e1b3dcd3c68b21a45883647bc514e695ef3adae8a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161008, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "1fb57f95c127a010a153a0aed0cc7c1b136c40c18234605c3d5fc96f3df85f99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "61cd3d2b514f0ed0d47f4514f8f97fa1ba96f580931145732afb291b62e98fcc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "760382b456f1e6b9127248d6b06c5d86e4e186e995b121c4e69b134347939a81"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "8e2b43907af501c0a1915c399f611c34b9f5b7ba1beceee7ab9ff6fc38e8e13b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "6598b21c03b4cdb34f959d27f93aab3748a500905785af713fc41e572da86ae6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "89fe248dd86026369d64e5af8c30ec910072f79aa689666c524ac452bbc571ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "57b2824013bf3511c5cb1e25b1f91ba0fe36ca4fb3c0cb0fd00f6e1720dafb24"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "9a56fdd07ccc4ee0da773e9938913862ae0f087511d682b5998bd06f392627d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, "1db2feb6b867814c23016e3dd6030764e267005bd44d9f42c365e1e5c2544535"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, "01ed0a54391008c2e8be006745bb3c1c9d25f7eb8ae7b25915a8ac79c4bda128"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161008, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "f1842a25625d4a8dcec33852e78ecfcd7e5a1609f8d8aa544619c5cdeed64f52"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "a6aa3f644b10fa6420941533e776d22d3ee0115204aeb2c322c6d90c6f9916f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "5a4258f638683ce15d23ece5b0fc3a45044aaf1b62c8b6f1dabfd05e5efb40d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "11cc0da684d24c021325d1bc47622d25af1992cc7ebab63ef8e2d00f07b18c07"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "58f682a4dedb3dd3fb473e7fb300bbd3dc9b65955053b94ac4bae69f72d41621"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "53e6bd7718aa086ebfca041d65e7494906c26db2e0769b0802072224671eb625"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "7244801147314fdad463dd13361a8c1452fee91184177c1b26d8a50089b24227"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "682c46ccb29860e40aa6878e7165509c26879b4f35b148be80de85f40655ee16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, "5072ceb37e8c0b64aa0720f1a55bed2f19c78c53868e781abfe0876e02a382d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, "6a04e95b3c0f9d6ac69e38a4dd0de7d2749c80d4254d2d4c80c0d0c2e50592a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161008, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "8ae3960b40e2f8b96e3153b86387853068b029f674bad79162d75aa7288b7c63"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "37462a498c0edad717cf31daa6a3a20245ed066a0055021f3daaf48e494d2aaa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "7d5de7dbec7ae8c999dbc5cef77bf93d38ce76b612610c9635ff98dd677be06f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "24b5bd9e72fcbc90e2dab0fd4158feee3cd3f435d1cc53a9203d9f814924285e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 118000, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, "368254bdaa98931682eb4fecb2bd6785832b8a77373954d4e96293ce2f17c1d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 117904, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, "28d8e40a668946b4427c3a7ab71061bc9fb469177f9e850967d5563fc7fa5edd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 118000, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, "bbb81888c847c737e78b0253d112f1a4b1abe356cad9e049411d30f42db151f3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 117904, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, "149e960c6742ca7e18b2656c09b0a99a52c3ecbacefe30792cbd6c510ad8704b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 118896, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "0cf9d3dd748da479d9022fe3158525829405b6bb0656f7cf1ac47f6174e2d418"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 118800, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "e865731fd3556b83a1ce477310f4dd5e0b72e42ba98881635edda1f62acba5b6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 118896, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "9aeda1832c29b3c91c13949effde82607ac4fb0ed56feda2b44c78a0d4116d95"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 118800, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "f6e2d271a8f45118edafdbc7599666ec0378a15ceb91c81c3a9f8732bfc19a48"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 118896, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "8b24468c02e193f449ca2d7d9b3504609c360a322735619de9ae4c48c190fc8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 118800, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "528566a451ee2d5bddc357b94c91a8d38362988f7cd6fc74024613121d708ebc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 118896, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "22d5dbce4a86d8dbb6601693a1c39dcdf153afafe3444fc723c8b0a4b957941c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 118800, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "344185369b985cb9b2b56680c03ef4bac93e21cff968c682fb52e8665ac63310"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 118000, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, "20bf0d9ce4805e19e270352d332b40025d2979a663a745c9fc95d63a71151a4f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 117904, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, "f68c7a191ab05bee96d2990ad701b423108fa6b1aea9aaf5294c41505a8aa4f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 118000, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, "64be1d15fbc6540ad5f0a2c6db4885bec386d8d59076528d264b42f644baf429"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 117904, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, "fdcf9a838d1f2d84092c68a6e820831ba4926d288917b039131b5cfbe0c669ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200912, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "6ea9ba426be6b614ba3aa8796a8bb13156653ff9078888af329c42925a2018c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 200912, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "420087185168a66b8786998a4c4b955f85a2d4bc581f6861d6142e2414031fcc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 217128, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, "08f901fb57b10ff3ef128d392dd97436a3bdd23f9cd4c294c32991c2d8af6482"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 193232, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "98c436b3087daddebd14a3051356e12f43f675c8de9a8ff7f9bf96256051cdde"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 193232, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "71d859b6fd7cccf1b6ba1c0d699cfab69aea931c13bf263009dcf2b714a5dd46"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "d10da15d12266508abee61864314b9817ae00984b1d5fe64552296b76c8a8965"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "232e898d7ba613aa1c76d7e4931aa52877dbf727d7302e78673e058abc5bb7c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 217104, 384, 2, 32, 0, 3, 64, 0, 1, true, false, false, "124f8ca3389fb7c58e93ccfcbf08e99b24f6fb812b4594146236cbbf4bfac985"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "a88ad123ac138957a85c64ef4655ae28e214cde6551490c6ad223351cb5ce796"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "67d97c229a6ca5271d24b825b659d94a053c76a00fe3d71e7cda88f4499dca2d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 181488, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "1562488544dbc50f430a86a6bb612e11cb5c4d0ca816370611a727cb480a0336"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "8302be4df42969dd000af7f80de55f79030557b1939bf661a3c594ed39d4a09a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 179440, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "b9511a2be0cc3370e077491da909a389324edfa92b4d0b0877d577ec6aebf845"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "3c76bf937a36839780eb88a7e96140b0961aa5b774271ee51c14f66ddc96cf4e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 217184, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, "d08adb403c7dc4ae2522d1ba28e0d85c8d880dcb18522b74c53f3454fc8d16a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 217088, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, "ae31d4100b3bbe1fd1b03ecbf8112659200b799b3352015dd749e2e65c1486b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 166640, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "5ea6551e05e2fdf4c835cfc0d378318e5c99712be0f48b2351555838c2e66e27"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "c925ab7fcf0e3f6e0adf45c111279397beda91843eab7c0a02af4518e5ab44fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 165616, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "196affc030253079a2e2117fecaa28ad612f61685cef8467bec630cb4df146d7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "c14aead30b80b6abde36100ad3e615c45ccfdd0122647ebc1e1ccd88b9f5b31b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200912, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "a170a53be593c1d15d558a09316eec081823332ec51da1512527320f5d2fb023"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 200912, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "3a54886cae9b1da3d7c330082cc83bdc9c34c59c4e30191d5791f6c0713e2ddb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 217128, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, "5cbe04ff1d3637a44f2009e6c32380172d2ac19f49c58870179d7fc382ecf142"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 193232, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "861c57fae8fda8286b766250766716e29158c9c9a3b668c162dec908b86a21ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 193232, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "4543d15cdbe4eaab3ecdaac87329df2fa397c062a859186e8eb4cf0f35dc03d2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "dc841f514475e3315397061971e880595e430df4610a772163939a5a12abab1a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "44a2cdedd302e637f15daf4f4a99b59eebe285ab110e2e71343f549f8dfcaddb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 217104, 384, 2, 64, 0, 3, 64, 0, 1, true, false, false, "c01ff7c4bf345f6309c8cc969df7ff9a2564128d4cbf832b094e13f782ef5028"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "90ed4b7f7e0765fb98d5db66e91aaa4e3fa087483acf3c26517cd2647a82458b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "dbd3bcf024d918840bfa605589084b14db4e015f59e2707d9a7e466b115ff435"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 181488, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "2292b878b52dfb75b804ef1a21a1c92e0178b37c403924bbb65a18c9a7ef5722"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "76e4279052e0bf72f9aad20e549639883bcd42e310a545e0cca57312891310fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 179440, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "ced3b103c2f649b318d785915f1d815c76333fd4574f3160aa14cff8c9c09654"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "2b19e70b31e26fcf3c40104b8a300164a7a3429b614f16e35aa877da191f83e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 217184, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, "8de99b57b1821595eadd75144134b556cfc32c6487c119db4eba3427bd4855e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 217088, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, "c92d8a321a0139db959f2cd529450df3de72f5ef0ab8d9128974f60da3e904db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 166640, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "237f472c9ca2fabc7beeff12d21b046a5bd78dbd3f756fab19f1d553858a85d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "8a42aafaa7a3bbece699977b7cf03aa48c35b93111c4a59b9fcbc0270abbb092"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 165616, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "b6c9cd78babe60d12d4fbf258d2af23b8e633586a18f5cd7186e5d681d5af996"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "78c262195ac133d275ec110549a200099e93a786289266eb6895d3579f1f91a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200400, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "58186c2a938adf9f8b6b1adf64beb8497ffd2963fd0c75db20bd53bf4d5061c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 200400, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "83898459ed4149c26477e70dd3512f3829bf95c8f380e1fe29d0b6865c6644d2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 218280, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, "22c329b8dd6fd575e6853595d1b0ee3ec9e064e100dfa97a0fe8b272dbcbafa9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 217128, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, "7497f4ad9865c9118e7eba009c12ec8bff201d83ca388a9781d93877a1e86468"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "e345b1ac6397192a25c0e482f92e57a029e5f69c6741acfb696596664988a264"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 192720, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "e0e0095323ded6c378a9aadbf145fd0bdea881d35d37043545f859e77b9a41a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "7919251d506b450486beeb421347e432dd9b4b171cc06933919b80933f00f913"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "7356d0abf54f63f094bc27b504047e3738321af291b181907e82d95a609995f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen", 218256, 384, 2, 32, 0, 3, 64, 0, 1, true, false, true, "416d9433b3eaf411150283bb2dd785130b567c494f07e36bf92aa816551d56c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 217104, 384, 2, 32, 0, 3, 64, 0, 1, true, false, false, "cef1b517cae64be600918b5ccd38ae6f0fc91043cd8e012ada9b164776a6a064"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "e639526e3889ccc667619ccf557a38fa959d8781119b73048ee2a19e94c035a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "ffb032b362fffd8dc7a984fec321dec4d0bde7066ecbb10074f6c245466e3b89"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 181488, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "80fe2eca9823bc8dab4523902134920706c17b0106100ede05b06ce58dcdb242"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "342b5a51e2f9c20762a6c5cbf8c15e494216ae1c7de46ac480cb10769a44bb77"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 179440, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "a1618ff68c284ed3f99e768584e9a1726febdb4d1539edb44557fb932e6b93b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "ac6b3405cac9d50383ce1ac420a19ea6802f30b8b57683b7dba0abaf8e3afd9a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 218352, 384, 2, 32, 0, 3, 64, 1, 0, true, false, true, "8ec4210b7a4a1f25605bf934fc43d7dae20f8e4873ab4faf37b32a9acd661750"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 217184, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, "0ab1d19d99caca4455bb3bdffa5775b06f17e75b04139c9a0148549b926b6a35"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 218256, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, "e25e63d9827639ce98c65437554f5f8e4b907df97a562db95895ce328be8c8c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 217088, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, "b34debe59c2a6f6ab5b3dcfd0098d3927833e6e6914dd86b3e02a248e2c0f8ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 166640, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "af858f31bf4143a63d67abaebaef47bdaa4637966429eb6ed2c43cf58e457bbb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "be2143c8f5382bb52310288d3926a1083482569fc198892eb7ecab249ea1c1f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 165616, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "96a5612099688e661e3f38bf3c02a147a734fa01dfb770e91811b0a161db2424"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "b8017d11785b30a1298f7cc60424f78ab316de5b02e6bac0c5ef0a7f9fc1c406"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200400, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "f56c326ba3bc7e13bef90217306b5e3119881cbe39f0bdb45c9f4be802b70c9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 200400, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "c82608152741ea53f924c2243842ae10fbfc1a622898c20ddd6b7a5b80f79514"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 218280, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, "c94c934570dbd0b8dcf8e1542d5d0b8848e9cb28f9654b4feab16537e90c5bf7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 217128, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, "e165a47b4c3d29dfae663711fa5e88af3a5540fceaa0a93b0025ee0d4a97d0b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "9a9c132f3e2380da6bdf52c5d04aaa8a085535f3faf7772a79c2df3704be9013"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 192720, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "bd402813d002fe9baa1b257582bd9b7db277e584afa0cc2e3024a33cda37c43c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "5285f350de01e9e80a10ef936534e1d04651b17815b625d7d06ae105382c907c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "7cf358af66d1c3889acf2d69448e9793a2ce216511c8c6ff13c887c5c7e7ae04"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128Static2CtaKeepsAbForGen", 218256, 384, 2, 64, 0, 3, 64, 0, 1, true, false, true, "17a8b822df506ecb01e6966e1c075a07c2cd88ded1771d17db71e24d019dcf6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 217104, 384, 2, 64, 0, 3, 64, 0, 1, true, false, false, "49aa4edd2b898d24d8801b036e922177e3c85a349e884ad6ef57b4ab79732ef3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "b5767be531d45e1eac9122b728ab3a738e4de95a315fbddaafe6bc46faf0b5d0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "463c2d251306679f784b53c5c8ad81528607caa4f0867eded76d2a7c2f82729f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 181488, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "ea92513dbb4f3570667dfad2fd04511eaf7efe1470a6e4caccb38051c8d7a194"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "767205cb242d7cf334c8931b24cad45cd46789234f9ee6f9a4e3ad39f10df989"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 179440, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "a2f6824eff4dc180e038cc9a1e529394c8ccd4164e89bd256b59ab6af70c4fe2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "ed03bc955d773ad34c88baa6d5cbe24dcfbef7f3fe6106e3f0d32bdadfbff0c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 218352, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, "27c6b6631db27a5989c0c25e78f828755fa5be4fce3066e81be3e2ec0e1585c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 217184, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, "8218fc64df3362acf3c9d708d14e469b68811786e5d72900e6ee8c032e4be4d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 218256, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, "d07ec00b58629f3a667f2d38a3ae3c652ee90dd82bd0cbf270d51460de35e768"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 217088, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, "da81bacf2b0abe7a0cf00cc3d6f142a93364088a16a90b853e854b7af6fd33e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 166640, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "8ee6203b787bcfb4926b4bff2c08b2035dd28b6c8568553c687963114ce4bf4c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "98cbf95fcc78ddb39d4e39e421431c473be8cdfedd378609132e868a80060a20"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 165616, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "8487589bff6bc3039ca773fadd681dbf48117d695193cd7dc1e300d652d80607"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "0758c141b27fc55ffab60feab0cc36aca68e5a76657b68ce2267539cd55680c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200144, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "ef5974f61b299d9ee16c65a19431fda4af9a86d28816e5619d1a5908a8c2f757"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 200144, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "6d432fb223deaaf5400a93aa7fe1fadf236ec40e84cd422bdb182e10446a8f55"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 217128, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, "962b6a84efe23edd45c096f0218e5633ee4f916265d87124b75acae5ed7a9d23"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192464, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "67aa999ca66601ee08de90d99d72f7a3b8a1aa28649a62f6b33b70ce01a1c11d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 192464, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "71806ce8321a463e9bed4d5a4366530c48071b55b5be6413a24f000f6c0e890e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "6f72ea46ff408f71677211f52ae855348625fbfa3feff4d1bb075baa6463b440"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "911aaf08f7aa61d579cc9a5662defcf3684ada8457393270e67e7342873abb64"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 217104, 384, 2, 32, 0, 3, 64, 0, 1, true, false, false, "0ea01aab6ee0c1e32e58cebcbcbd140de077e9399cbe424011b810bbdac33bea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "9299b793e39930f16e57e3d19e9e3a07dfb8c1f1895f50c4a9433f9a471c1f0c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "b0aabf215af5831e44c0dc5ecf4f4b9087e8dd6b5195ab0a5a21d15bf993915b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 181488, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "d57516d23a19b6658285a59b5e2756c43b15554c735d6079b1325f3f8a6b573d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "d084da00dde1a8104ce88385386721ee65e1fbc9ea62a4dda4973df559547a3c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 179440, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "5d9e87353e67d6fd64a4bea91cb30e91b7ac1a6138801f183cc7b925cf2e01a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 167056, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "74ab923139f238bcfb939ff4e022a7b512f828da349fda0fc2f75ae77579c26a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 217184, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, "f6f92521b21cc4b2b990f93f9fde4625fefaf9a5e27110ad72452e326d782952"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 217088, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, "84ecceef42d55d1d81e2557359aa24e76c8ccc8ae877805cb23b0a8f29d29af9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 166640, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "61c4a75a08a50422377e11dd969ae2703a7a0391ae1f9ea73e961735d473f09c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "972e01e5e665241ce5b4e44517baf47bcb1318adab01f978be304269ff51a4a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 165616, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "31a600278d021592985228652189ffd67d7bf45ee4f6a407a5875d584a2bd6fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 159376, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "e9173dea9ee0bb882240316e70707da7f66ddd55ee286e51a2810709fad1acc0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200144, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "0022f3d311d7a7f08abaff108b858f56e0cce19c25f7c7dcd8d8cbd4bad74a95"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 200144, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "1271160ae3681df4b03826297cc344e5d8799bf85b9d4ceeb4a3496341631390"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 217128, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, "f5d896bf0f4bc1cce077b6beba1c5766623672b148c1af9f50b122bc99de8793"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192464, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "738bcde2bb2c40973f0d83fa2d8aa1c2cd0f70bbfc4d32f21dcccd7e519537cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 192464, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "adf0afcfc98bef70edefdb138cbb1e7e021b66aa5c016f25a5ba3c1b932e3ec9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "05d8ba893097f5fd761d5e47ec642327889a2fb93cb999f88723c78b50ff4b8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "42c4b2745013b9c27fd7743f933f3ea4bb7db278179a65bc69b94ea231af9eb8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 217104, 384, 2, 64, 0, 3, 64, 0, 1, true, false, false, "e2bb4cf8194081d342bfe2bddbcb33d34333c0cfb3f8d4fe02003ef02bf8e419"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "338b8a7eabd050cdd83cb6f4d02edea29ea7e8ab386ef52ce0da0ca6352fca27"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "cef2aa070b91f4c5cf73fcb6ee92887f777ed923bc591fea6f6cc4ef2eedf3ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 181488, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "c1998712d45d3c023162c7d5dd5b23e3abd7b48771544acf41f709c99d6330c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "710ec679dd31632c348b84797dd105dce6b27bbba1189e7e9f7feddfaa24a44a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 179440, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "ddc59f07ed1b18c542f47e0f9f597ce9b8a368f716d05d4cee5981e5a0342336"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 167056, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "413c0ea014dc0c53f0a81a2c27a1129947dc978f52e4deeea7b4d69487d6bcb0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 217184, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, "4a64da9ded26e31eaa7e3721599d9c9fe0e6febf5f179c323af699c284ffd0ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 217088, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, "9a8c0b9010cc5ceaae6c8520de5f587b42a7ecf635c59228bd8582fe538f939f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 166640, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "d7a19fa7def4d186e4813a44c6ba5296e4a86a01361df023ee45d8818110e534"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "cc237e9b95684a55d4719283184670a8829236cd605436a21e568911ea602b5a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 165616, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "8feb20cbccfa3625886b7bfcb136206f641fa5ab16f446645b426e61eae28507"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 159376, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "eb5259fccbfed0679145561dc4cb6fa64e60e7eb3d08dd95c1ce5c36b86a3d4e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, "1bfd274fc88bba29c0c3b580e1570b6f959713420681f58be01de51e4e126eb1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, "e00dc1994dab8140cc38a8b8d5dc2977a7fc3bb6c19835b330b9f6df45077506"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, "3f67e93d9642a24974c330ee708ab6114e77dc9124aaf1f0e0e2f6f00ddfc8b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, "f0b05167c49c8e348934ddcc493a3e1b902ac9339dfc47eae1ef78a943d7d40e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, "4650ba11f3927463a41d71bff3e49256f1cfbd5663858049bc26914ec52b30a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, "a23feb6977cd799007a09fbb5eced925b40c6f18559f8565ed53742789d5450f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "6e1e7484ceb689dfae546c11a6510243dc1373e9707cd10d39b9a425cb6b792a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "352fd36236b23eaa8f0b3d3a3d4ed1a67d3ef747264ca487a46adc3130039e29"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "3eb9ff8ba9b500c71b56897c05e415d41447efb4b672bf8342b5e872dc06d8b6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "04e3149bbe0422a5a2999746489895a93e59afff87c60c02401fc476d1003c40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "2a777d1a31b935057295af16e39b619144972d19bf4487a30e94d92f0142370f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "6d94a385811110ca7727a152b5cf86f80f33617ec71f1d1b944727f336b64f5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "0c50ab900d9af242b12ec07702533370545cc60e75dee4fb62a0b05d21c34625"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "fad44fc07f4010b8c48709b7a1e3693f4c76cad61337e6547797f291eb964bc3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "5b98aef58d9cd1809e31a58ff56cb1ef009f367dc192e68545db38bb80ae9835"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "7b5118fe3867da7f1d7ce6e2bc95257b6b6d49d334af8d45f55d5656993bff74"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "abf94706980213a7d8818030c5f064883fd4958a8f5d5e0c27935a7932b8bd95"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "a6c8ff550d1eebd567236cec9ae025e6170f9365767e832a617b73ecdf441f1c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 158448, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "7f7ae85512b5c1ae124df726a1a6b7a7819dbd9e53ac5885315a3a03533fe4f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "a3b089408c6245f50ce12562338c6b8947b923d14c743981273a018411e996cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "385bf88bddad133e343aeb0eeec10823f609762e4894fd0e13c72ac85d002cd2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "a9d238b89e757fd8e61acea0c8eee692af14c723732d8e6c1aff16f3a0f2c86f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "ff7a8a5c00f05238b8cc779896dc0c0e188f2c3aedc1cc4a3ee3c57b527e3850"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "e051ddf573a309455c93d039e6948bf5e487f7da837c0a31ddb8e8360ae5dd51"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "9dc1b35ea680fbc45f7a7efb2c51deef2aebdbbcf29736e1ffa5be5079fb3ac4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "7f6bc08205f18b8074559d12a93a3b833b4848cf56708fcac9240cfd09d6ce65"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "64f06c7d0ef8801d3d0f5a235ba13119ba4a6a2391e691ef65ae63c328236f91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "1086ca71f94e7376f466b41bc463bfb7dbfdb6aaf4f59231fb0893586ebfd891"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 158448, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "9b51a19ecf774d4dec03ae87484df4ab71042e7e56f35bb2053bebcd7384ddbd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "c62a62d47500b9846d5f554325e9cfa1358cec69abcb3ce0f45c021507199b16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "cc80c531dd41757668cbada42dcfbf634afac28d4ba02997d8aad4afe4f284cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "fd9fae9aafb176c538bc0f1b7e084eb1283988eace8fd7dfa767933751eaea7b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "d8e9645ef618587ddcb7804e237992b09818f6bbfecc31b47815bed297720bb4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "e23a41c877718a10091baaf8f3b604f9b713c443b3fe0463e8bc115da7349110"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, "8231cacfb6c810376d26c73f0aedc0a5fdc75eb087e3da1f6c39f03a96228b7b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, "ac8952b3be2a4040b75af6d14b57d8e006162f383fbc97cb04f1d6f4a199f289"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "91a1abb7869cf1240b465304df0fc84a941e78f84d7b2d9c3e801dc7a6e82b8d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "4ceba075a2c7d5819fb6f9eec25362cde1ed832b14ed0ba3af99eec0e0ef62e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 158448, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "3745bc29fc8272daf660833b19cd8498e5296fe1a99b332f8611dd9ab9ad931d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "d8c60e764324b0f611a357dfaac568307d7469e316c95c39b33ccfe43bc9e7b7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "f9149553ca6be7a9113ceef7ea344a6b380eacd9f0469547d11b1ed501e8fb89"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "46a0da780e894626772de41498b869ed55fef597ac5302e5cfc34f653f15832b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "a2391ba29fccd12e40153490a33c60ec2ac4c6da47adbfa798bc4202d4722122"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "ba9232377c0fdd3ace7faf20e75251350fbdc0f39ebf3427fd74de4bf4b9fc86"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, "91210939f7d8b655e0328058f8cd71ca3bcbc0a1fcf9f80b03736a835219f6ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, "ea9211f3a2aa2ceae80ea505d372c13469959d957d62d3a511b83a21623fbc9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "3188fc732ada28314e8c4d467180c8c14b84f4f5962cf9ac61ecceca0b23f428"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "2db7f3610a7d9ab5d5f60b4ee0535658561e4fa8f66e0a3f9bb52b40cdd6f6f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 158448, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "aab3ca4c37f8417b697805e7b7ff7faaa2e923643b6c9b0f08d848e84c340eb3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "7b2ac1649e267794580c7a964a634869ad2f1e3efb9f55df32b2df69559ff6d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 216288, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, "ae553b7ea126859d2e8ba2288d3327eb30bd74cfa15c3b51fc5602fc579a20c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, "c4cd847f16d6c0825f43edfa9b95d1fa9ede6fb04c4578589e3047ed485f080d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 216288, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, "42a5bb933bbad0a7ceea945007c77b3981902563d542a09409ef6462b07cab90"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, "1775c7e31a189d0aaf5cdc71eb8dbd319fcb98ca6421db368cb0d1a227357c84"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 216288, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, "51ba94eff889de64ea4ad51f7b5ea718a7b39d45bcf51c8e24c1afdb795201ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, "68845c9f0a030faf4543f73ce524d888324f32240372298c4aa858ab433d4da5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, "fa2414ae89be4a1dca230fe231d2e6930f6dabfe02546405fded35d2d1e08129"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, "95a834df082467307a0c414cad6c6d2558165d65666fd07eeb6a4bcae5be4b0f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, "b66e0efa4ad20cc27f389e99783675566eafde1f6c70d9f23c0030ccea4a5096"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, "863882c19e017753c7fbb63bca5bccd68296a1091262ae111d5226ee092b6692"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "8e9954a413e4d8dda7ff3add9c4adf651efce9b26a421a84f02ac3a9d93cbcac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "ea889e943841c2e9027b9d91c6f13d7c040e410966990bde04af4d8414228efc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "df88e38054614646513be5de24117bfabef4d79af9736c965f087e3cd0fad40d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "390f647c2803c5a5f195e694d98c22f25a6c1a9687b5b5dd331a35e4ebb7f550"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, "98ffafe779662ed1f9ca5b9e965dd3bc5f6a35b07cf5ff1dfab68c1c64f51d90"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, "20dc38c6a4ae677254c5e86af6bc3ce887c4e065d4e8909628835915d516970b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "b893e7528cad07990e167f7069be8a2d6029e52f1a8cceb1c83d59510dba6236"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "ba195bff628462dfdb0975670e4217d74126c11138729ec6ec70633124c75d23"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "2aba736a8ee26f798003114ab8c9e6f9a949d95c549f0d44a9f4649c18754dfa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "a4d7d4d5ba5b41555ecaf1119fec1c695718bbac7409230f57b57dfc876661f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "6c9b6ab718f3338924c0c89bdf0ec8a04167d83c781f63160ac5e1d55b12941b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "e94d8e980fa3d2f60bbefe28c062ed62f3fb7ecf7c235ab868e8f8eed45b97a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "a938ec3ed9d616b06484d5d157088ad4c869a641e45d3623e525528935ad1835"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "4eb41ecb5951f6a7c2597fd9e3b593ed6524395ec50cb66e5b44c94d6c73eef3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, "6289bb584378b5be0713b503e538d7d0c8e5902279dfc832485a987f243945ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, "c2ba16f707048615130e9c2d778fdd337e0bb7f3446731018326ac433005a161"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "b571bcab9d7eacfdcc1739141f9a6e380ca1e750df97d0012a7970600a003402"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "7a5f90fe94597ca4f23b700141d8494d42716bccb35bfe581bb9b050596c315c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "5461eee8c3a0810f42dbc1ae06d11de023ad4f61461b61df82e1ba17fcbd99d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "733069de65b5cdf317325b746d27cc220be49059a64bf6f08c72b17c41af0732"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "4e6898ba09a54bfb4bf61ef87ea8404c730adccd2a24af3f9dcd36c41fcf9011"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "b509132bef97ad04974103142c829471885418eb8d839f8afb60b61c533b9fee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "567bd963daa3f3cf40adc3e43593a38941b9ac25e0fe47d1d749be45e1e5ddac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "9518314aeb44aaee1171fd0e884ff5ba2644c549a4d611d35280e972e8ef5d23"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, "b25029f940f222652303fdd3f7831d1b2092170979db7c8104ed57452cfe7778"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, "9149a034a70326e5737f7232ef884b9f37274bfceac6733868f724fc8277eaa8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "b822141dbccaf11296c23c98419d9f3803730f75611b68acb0e80288e9580de1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "e756902bae346018f44ec1cfa8a852227e3e89db56d35d220e0384d57f784c34"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "f36242be913e90cf62e76bef34aaa5426ec1686f287d9e0b3f247ec66baba534"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "09a242b8fe99f9138ff8c7d5e767d7ead7a6be425dba2b04d0b163bc74c65af3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "78b5880f7c984c814380cfd8bcc24ad0f95e80d24cf78fedccdffa25489420ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "00bfeec6c7b7ce920445a6c3ab82dae2f534b3ab9c026265637f11d08e78eb13"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "b9ef9d462162d326362725e670880538de3ffcb7197e77ceda5353eddfa06203"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "a1b99e2efbb125024ff3fee6f99ed924122ddf6796e3a2bc035bf3425476c7cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, "ca44f439102e0b609679f621f3bb22cf31659cd1a6a6736053d3a680ce0bfdbf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, "d263b70eb7b46a5c5967d90e9de72d27bbbbac43634895283c06d3f8b6431ea7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "68edc1f88ee62ebb2c14f79cee61bf1b0eb47b552c6f469958295bc27e2d1a6b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "86601f1673bce72915a3ac6558a6d81041d58ee197ccca7edb6c808168de2ff6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "ecd11272326a11433c3a5eda42fca6a12a8680395ba87c70b218809a4f051f6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "4b23e77b6a3c8bc24c85a836d00e941f8e96919a0578cc3d9ea40b1592c398cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 44272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, "f1e9571cf6b9a4e8c209f22c80425a371169574bfba4b3702810fef2cd9f658b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, "dba5fc304ab78853c2fc03aa9fd565ef2d509773c3f5f3440f7642447aea2156"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 44272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, "d1a27aeaa979b956c8ccba7f540c2c8626552dae9c7b23a356925562bf5e1500"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, "756ccc960157550a86cd924c7237887014376ab83a70db376a04e953093f3761"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 44272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, "12bcab901249d5ed5983168f4c0d5e20bb9d8fef53d872a54b04ea25f2480a28"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, "da6b9e381cc427847914c971f7afd1eae439b08022ef02fdac5bfbc03c1628c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "e6482f2f2f9fa2672438ef291050f773e575b0043514cbc88076a9499b78cef2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "e3cc33fc225fcd1c768a1e334db896926399d12bec698113c756e92492bcdc37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "784c12f9be72d274fa344c62c209e916cc7436da683f7f50d32ffbbd8de0e4c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "d40d142f31ed613200ce3d8f10bd0a160a8b78051629d932d38f232919c85605"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "83819fecf7ab614b87408e941d3dab66f2a62a5a5e6ecc3d6388a9eba06f658e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "1fbfe0a146422a5d6c65e09b3cd32611a52b01f7ca677ac83be3229da33fe047"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "f36eb8f39a3eafe1cfb901d1ad55c56e390e522e20e154343f7d1afad4add93f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "7c3cb22f2a9e474db5c8e703aa57e0553fe1297a0dd575d8b22302ae0218da40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "143337cd8e9ff7100f7c06d8b70526b55562eff11cd315b20c954211c309e1b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "f7b03a7c386c57eedc5be548510f8719d94906def67ef26945e8d963af3cdd3c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161008, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "216950a8964ea8c0e47f5969d85d72aad49ce593c852718a170c642ce2ff7186"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "2d51c82b34a3bf3f930998e67887249347cce5415eea0b56ff231f548e95ec95"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "102260db25bfb9f5dd8c8d8deae62a1282a04e3ab39bf994b213ce0ec9f78153"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "8ac2ecdc227b43e2ae43f42faa9e870fc05a5d5ac754b520c38eea7d10dbbba7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "e9ae2e66f8c8128fb774ab74595298a7b9ccfa3d5420907d73a669c09881221f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "9c4d65f285a28b7f618fdb1571f754873c66ae684889aae18a8b2261597bd4f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "7d3158e8ddf68abd43456e7bdcdb6683965a188a53296dc27232420c71fda3db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "bf75f2f86d3702ddb4a7e8569067dc99d98860b1dc68da2e4f1cc636ec45b010"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "db12b1d65eeaad4c996075b355792ab0b32b90d848211b09a5e33d4df5bed342"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "98547d9face58fc17492aa731af46427d418130003eb61c256bcd28b29fda033"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161008, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "59b1e14b817df2e105e8b74f7db8c94b4f785d0319f19c4a430412d1e81bf6ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "8de0f795e13a2e8980973579068f936a63af5fa0691f27846da32df6fd78e8df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "d0e783478735cf6c5837c2e9067fdf097e26faa9f1f87f5b90205243cb766927"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "a8d54d15999a272227156387393c52dbfffd5d8fb5b11741de8d35138b3a9097"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "676e9a2779187c284e563b18c83bafd315d825c81b365125a2ee2d46bc960542"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "7c905d821e8169b77d93d8da852367a7a5adca8e222daa16dc21cb0f8057bcda"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "1655a79a405ccc4839ab5320e6822b60323b0b817461ae7d7b86ee9b845f1cac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "a67d4e90de512f5e8ec536b5299a259be92db5a0a35ece01a3d89c761624569f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, "1aa30b3e68ca48e361eadb8379641dc542e1e2565f1cb6adf0965bcf5b554eb4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, "463e47e4063e72acb508d4da8ef040eadd1bf52906f0ba38d0c7ce7c7a1345fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161008, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "a6b35e2c60eecf6772bc735c187ae1f319154bdd5417181b0b1a6ea43ac62cbb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "55a82e35bc0656cffa3c9ca2fcfc4d93113a548d2a9e1e70d49f31fbdeaf8b04"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "17bdf5845a15cf7c9c6a556840c0f9b3944d7df16c3f102c7ed875f1f42c5e65"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "186fcf000fa0284c383c824af8625b3927e5dcfdd200b1ad2588394adc98b54d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "59d918ba842252bd244adb247a4e02a69c89a58af8b3e8f49611aae85aac8072"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "b59d562a13159b4c81a739f429626d8755bf6088d81d6031fa5b246961686627"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "7f7482ea21f0873dc13f65864610cd4cf7456c0a92b36af26a7e26fea2421855"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "bd8a6dd9a8cda46f8c3860f32e0d2fd0430c650e5cf1ff1f3fe794b2be601d11"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, "9ca19355c9224f2b3531561b8ede85af7782b70101f297c84c8c27e3160ec26a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, "b531d95b2a4d403c6113582b1bed3b71fbd6694fba4ee2e8b0817df9910cc644"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161008, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "a30bed601354ae1eeb7c238c8f8c3e0d7364d95f28c9bd33464c4ae58172a275"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "2179d37b4910ccdb7047962a4108fafe273f76189f61ac5df373efbe8ed42092"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "175c7186a539ba18cc55b9b2cd1f5e83b6d1da4b87ce636c9414facfa58bc289"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "0f2cd82625553e3d49ff3c7dfb3ec40938469ee99ed210abcd499c389fcd41c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, "89cf406364921c26acfef7288a4b3d77026daa997003c93deb11296d8138059e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, "8a3ba8044d1392efd089633fe932e167f05b11ca5d038bbffaf9a05677d7bca9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, "f254dcc9a216a4b05756229a3710a1d9553265afc8c18a3cfa55de8ac8becd55"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, "89ee0207f9be84ba331768256dcc0295eeea2428505cf0d69d02721d720c592a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, "20a45d21564a0e187b1218732f575e6d48bd5f2f024a791811ac13acbd99c200"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, "5e1ac9387650a186ec09355a8a5780e358eecc7bdcc0dadc7af080af8f3ce05b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "efb929026c39064d3ae601faedbc6f6df244ec2d81ed36665409ff2e7667e076"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "7727960cc13546ad996440d8c72d79078535aa1496d11b645de5f71a8eb25b43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "83b1cef418676d02d00ebd98e558ad31a1f85996b4cf48fa184d17b7dcc3e64f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "037d5eccff56e310ccfa135a6f163d235ef3a7fdd731df2e52b1e276581c7855"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false, "50e9d90e4d019eb0c2e31067b9effe1d9c8418b57f83cf14b5dc76f153d31ee0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, "b1f9192a607590151f7dbfdd9e57d06ac89ae3f68ea6c5f9ec2381ab4333ebdd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 184432, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, "f5d55223ee8ef9b91f75697ca984c61bcb40653b5e3fa9922f16881221361a6a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, "dedf4f67a3f24ab1652842d8a2f0499c2d7bec7986fffc6ec8e386432b7ef705"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false, "7446e588ba2e3bb5112a9f4111d4330c63842288b230e79f5b9249bd0a48095a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, "3ef54c6322c72eb054bd06cbbe005b42095611562556128a65439644d325a40c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 184432, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, "96ac1989d716146cbe59862b113c0a81ee2540d19d161e5bdd4e1a7401993987"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, "1dd8da4c20078d4bafcf2b400293362c575c1e5fb1939eaabb6d4b1c0a01504a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "e3f002c295561a3713108b79e6ac8a6cb469bb34bc50a6d45452560cc664ff02"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "73eccdce7db1609fe3ddac5bc1e129d22105121333b4036d9788a62e8487b489"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "dedbd9747f438ea87267cbc23b2a08a9808e538ef29c21643d8631949465a3c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "7a0e5bfa7df12af79569d1e5e3af4b09969dbad439558da19183f00a2efdcc20"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "73205b29b1c6ee43fa4308769b66088337e7d4f32f11a8dfe3df392b545e4bb2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "a736ae1a0a3f86bae68f993a0f7e72d633a5c35ff1edf8a88c10c40bbcdc359d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 163056, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "5d17db017be871d49ba4d988c25591546fdc5b9986eb0936cbb7994319e5c86f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "0f182de58c101a466a1663a2f80fc77bd498d3adf1cb07f020725035747c4616"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157424, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "99431dfdaf52fbfcda2e69ed5255a3d478a89dc28edc7cc12610dadd92aa25c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "64ad0f8baab9eed04119339d0c1205a2a5bf7b2fd3792668920c63b92342f217"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "fd28cb6bc0ac0ea08afae4d45bcfb9c3cd1ea942b18075f4646f3fc26f4055bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "b261db6329a9fba9fed5ebc1131c616aa78e48f2f6a20368fbe1be3e6fe63af7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "bc821a96666be15ffe542df45cd6da1e7f38f2a548183e89665ba3cd3cdd6ad6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "22e32af15d538330e95abb3a391149d376d2f01ebce83dd7bf4707278e80d624"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "ad582e128e8075596992733f5aba9b315ec65d14ed199b15585ff06f6a0175b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "89b8d934ccd1b57a11dd01c75642ebe5d0af9dc2682d9a901015f8842ed90fbe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 163056, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "f54ce355d0b4263e948cbab963dc091844343b8ff1f9e1a2c6c2bf4d3d49164e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "a2176dcef38487104444f10fef495b55565ed4a6a2b8deb876efd3a235b69aae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157424, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "260da146d5b380de241130de0a4f6119e8ab1da7552fd764a15a51c2cf9f743a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "371632b2b033fa6dea4d875e17c1b5edb4c74001fa805608c203db7917914431"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "b161c32017738a954642deed3ac88ea004da3bd6d090ae9d5839e0d5402a577d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "6c8c947fafa4d78b40df1ca46cfd2246fa5bedd32bfeeac73b86d01ad6c3f002"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "18122d4300cdb4d4ca9b0df38076bae4503f9eee2d9cdde19af9a8fb9a9844a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "aa9826e58b575c9612cb09f149fa011ff57aed1ecd69de0b464e0d1d3e88713d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, "d0a465b1b16213959a402e8e72254178e056d57061eac591323280e27bb6b55c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, "98eb915f3ffa08aeb5ba40e6102f25bd3c6e03c2932e8d68ef73cf43449a7f17"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 163056, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "616442da3f0ee2d6bc46d1240fdb9f590f5848ab3e6845bd7e845f3d72e96bcf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "72fe0c488bed21d15705b7f2c7af63143a16965d8433f61b7fe41edd2ddd71d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157424, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "1aa884901736b459f4eba77d2b33343e0a097b7bb7c4d26ae6a2632f8c75c4b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "7f771a2190b29bbc61b18b0b1e6c1ab5fd9893eca485218470bed6ff98365a04"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "b921848a015ad9b8e967c80d13283a0f544a191be865167d145d5948d8a9b705"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "291cdc1aaf072b0e0de834899db9aa03a72087b5d9e3a4daabd9a7cec9970d4e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "3e154c265169ad20e33c300a1da54dd8ab2557c14dfc2dd73579c1233f0369be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "e742a5c2379badb9590c3284bdd75bfe034b3c9169518c61dcc357abdbb84c51"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, "3bec2a9657967611aceda91efc8ff59b13eaf7ec008a00262173c0c35ac9e0f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, "090b0b9ac776daecd6354bdc18ddbd61b7d1a2ee66b7baf1333dcaf3a81a97b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 163056, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "06699f84f9ea1a8d5182dbfe8a7083e46aeea0f028224504d00aa400d013a75d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "1fbdbc6d9b921b93b228da08030fcceefb704458965f188acbfaa5b66034b539"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157424, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "791a4e34fcda4278c4c4759f70274d2324afa2ee680c0145b85a4bd900e18f41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "508749fa4291d56c5bd7a8482d443f353d63e9688288f966c31bd02a56e47e39"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 216288, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, "dbe78c8bfc00e66f86b1dc5a5218c93e5eb4c1643d7681bbe3f40e1a59818059"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, "aeaba2b45bcda86d978dc50260b47d1a1d059fdfe24c16fc5e26d71f89302d76"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 216288, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, "12c596690a7dc7cc3bba2d57a24aa6b7739112d41e3e13c1891fe4b685d08d78"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, "9124880589fae357ee26406904350a46eccb36d94bc76e165165d9469aa898f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 216288, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, "30f53fe3f558e787dfc22dc6ade4014268d5d4211ba5dee7b40e726614c489b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, "9ea75bef557e72601f1902767a4d703d3e98b02ef2a5d7691b0ab6fd1cf9bf63"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, "27b9d9b5601b045ca73ffe3a6c3d2e3882a67a707a8a06fc62ccf39503188d99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, "b2e58b88c1a6e9488cafc1629300d8295c67088a8e5e22065149f3ac716075b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, "37e93b1da2389bb43c13cdc3a089368b5cda173f021d539c794ff6f09f68ab7c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, "9d0ca12333ab1fe90ba6092cb9dfb373a04988a2fdc45206b3fe7c1f302c03da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "238a00394c55e58381f9ae09964d1bc59587aa9b2db2a892df95bf41b348207e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "f1d8480c9ee8134cab89aa19ac428766d0299a5ffb1e09c18cc35370646737ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "86a90fad309da11dd8bd8c95a7ff2cfd65cc8a02de91759d3072cc24420f1ad8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "2ac2836e6d93e703a72f1948b03c1aa3e2c4006513b33be567ec695ba7f4dfc7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, "de441120c328dd0dafb881c9b5ecb014cab13190853494e07e3aaabd34358b99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, "bc591d3129743f95bee05d62783445d8d079d91ac1aa7c6e04e216989f4192dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 167152, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "29572e25ffbe0c5c40a6f5029870f3eec0b8968e4a3be3e194c74226018053dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "2cb8893570cfaf12436f8b6288943a38d8f162c5c203b7c19dee5dce6afbc71d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 159472, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "77a3b312f95a148a29477692b53014b197bf0571ad83ce2ca30845042341d8d7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "e81cd62026a3b956ddd11953bc80324e40cdd03680613564bf7e2ac4aeaaa02f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "7dd36beadcc78b601ec1a6bec3fdbf7fc26e7d5c5245e07c7800e19f6c4d0ae1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "acf5c69ba59fc843da5abb02f542bf115d2aaa962b3c854826a8e945842c0e5a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "fc5e9329de6f7993347b79ec1999df1ce18bdb4ddafba81f9171db87ff45a97a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "bb6b68d2fa8a08b461e274dea844184b576dbeab2d7ef4ce3e64215286a0bca7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, "400c13ff122532f508522b7c5b4b553d5c0c07eb7ec00194fa73767ea457d4c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, "c8d9f2e3475b2c5a9e10fd04116d4f364ccc0636023f699f297111a7aa00d215"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 167152, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "141ae7f9a6b6086899b7005b409e4309574ec17e5bb7cea466f6f89f17011603"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "3dfcaf58e7a5796fc8ba710fa80181dd9c1667457fc8968662ac66fca7b3ca68"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 159472, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "9a3a69f9a3ce78cb0dc798522eb495dcc86c5432ecb5ea922440d1d89a3d77a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "df382a84e1cc8c7c34afb222189f5da9fa1c8ecc6e523cc314405376084b44d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "797eb888d3f43746a9636be08799ab015b7c53418b918e6d7411a49ca3bdcd16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "9c5cbc7c6679193446d93a80485c573f12fcd188d76fefe962d65ce2acb3f086"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "e75abb66088824e4768fd191dbf6f59de635d40ace07d3695733507411b29f5c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "29aaac613e8f48335f72ed1dc57bac2fb28971efda8212ee20f8ec01545fe77b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, "295ea1d963b71fbfe2de7f95a8cf90414dfa82099b9778c11865569629e7dd9d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, "9d0cd374e8d1ca48850543b64761f4e1b1f2a01fcb428e386054a4a84118c67c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 167152, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "628244e4795adbb575b329f224d9a35b7c72863d5f5c7ab6c9fafd461dfb9e07"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "629e75306cd53a2b2cdd4ab9cb385ad086a33274c52e13c44687ae6a787aa981"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 159472, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "7dc61cb57c34ed93109289c330aafbe688ebb8d4bd2567a3275a2cfa1f81c94d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "447bf868ef724bec1d3e65879bb0a26cf0565e3f7896a004e47f234ec858a572"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "1a41a5640c70a0d0be22e7061c076934e27b58835db4e10183a206dcb14f137c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "50e5cd0634fec0fef1fc1af05a414063ef21bbd0bb29952ff7b683f06b80b3e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "d69ef8842a66639b230534ab7cd87c96451742f914582a46b0e3ab8d2edaba57"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "10beb57541ff787777247192f9f3dc7d37aedb4d3c566e88ca8d5e52a74134ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, "37bd777636915245fa7f864ec1145a2536af831153ab35d56e4224814d5a16c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, "cd90333f0db8abc550fe6f3d6ea90ef77d5a5243a3ede388b86403ce12059758"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 167152, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "45e1b7cd145f8f99e9c6fb37f6265137d82dd19e194cf7e21db1ef6ef86a125c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "b5d81db041cda2b249f5a5234e30c73fc9e7cc4d34824f82a73e029615f761cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 159472, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "64dfe8432df9a2c1ac513ce164284b6198921f4d8fd362bc5775e5316133fe34"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "1e59c5144abbbdbb8baa203ba9e25b00f38bc5203452dc5983d9b980ec0db3fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 44272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, "0376e462f9d28b3ab425452581a5b840be6bbe315cd29fec89989d311e76c49d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, "0859ae893a2e433199684bd33805ec6bb09ad1017f4d74d139bd97fcb672dc43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 44272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, "cb41836345f3177b23e1452e632ff228b49e49d49517833cc32aa335accbd3a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, "9b8ff1e86fa9fecedd82426ff9a383c000bd73f94a02fcb763d6f6fcbbb61fb5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 44272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, "603b29907c5b664a43ca2a6ae8143e7e0c03ca0a303ffede61fa2684edf88a90"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, "4c498b7aeb470196b3c255e2ca483132a2be0d5cfdb7852310a6e92d62841ece"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "6404c6a6bfefaf2fee9cbb56566ea60c2bea5b4629beb3a8f25b90e2e5e61ad7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "f357325aead22ed440747a21ad8dcc28d9e20352f6adc5c02f41f7b1c81bfbd8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "ee3d9826ebe520c3ce11865e78499853ed09b1ecceb0dcd193bd34b5bb4c9e1b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "5d439e1d5e2614fcf239d6cdafcf0d1e667c1fae673806c6b0f745b10e163101"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 194640, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false, "62e9a052c8ad3fcceca9a8d64a5336075b1981bd97d554a35c7725cb8737023b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 159776, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, "d11a3ba0fd96a1aebbe73a6c7dfce63e4cfa27128d31e6778f76914e263e38be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 168048, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, "5ac175d87258c3f27329fe725d3b58b54e3492de1f6c0d90c6bc46d90c78ea61"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 159760, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, "e7a85958ef99b11542f690a6456a312d590df410ed5668767192e0f35f3f75a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 194640, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false, "bba4a7fbe8b6d6d400242b24184529846279b563a32d43568af0880ad976d99f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 159776, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, "74596558c1665216d65eab9381dd4dddea9de70d0e22581096ad51b630b73cb1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 168048, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, "93a4236c8c0f1cac623df702b9e84af9cece71db3e81d485c7a5a7bf9c526f2e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 159760, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, "37c484612cd2ef8be818d37ff3a0cdbe2b01b4fce4f5084dc2ce124c8b05b2a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "27f09d1e40a1fe762dfb8034183075fc53d06ad0f08617a0664cc9f373b51c43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "8fa470502efe73b50d60ab59222a6669cd355107d9595d70ff71a548360a5377"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "ee53db44d490cd8501d983407ff4a6726b44dd4e16626816aaff8d554b2a75e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "f9c88fbe9fe3b3fbf002b47b58bcc8ffbb97457d054e7d99ae32925f91b836ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "861ef50c986c66d0c5bce5d9aab90e6e0e60d2cf7fe885e01da4122260011491"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "0aa6f2447308f8aade1cc8adae12d024f7473f8d489aa760803a4fe55a3e6afa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 159984, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "afbaad8bc896175b6317ca2b28ed8a4331773c2cd1ae0c52fc1ddc12fa5aacfe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "a78a605c07edcf61615ce2caa9b706cff97d04383e893b81da592b6b3b0090be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155888, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "3c82f97eba6f0dd1c747dd6b88f0330f53fe143a0800ff66f16b02383107e26f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "179e23177df77ed663d883cea56f39b93a8d811dec6d073d751ba70eb167f449"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "ff19045533eedfd58e79ef9e799fe187f00f0bb03e0b28a589a0a949f7e0a521"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "fe9e4760c4b41f0500300bca74b30ec29bfe59cbfcfa5566542e0fbb765015f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "358c3f56a3087135c0af2ac80df962d421c07d286900879ac3af14a44e033117"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "18b777079ca2fa0d9f3ba8b16ac5422b5ab529b972fb60c2cb43cec0775cafb8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "6db9dcbca2ffc13c97a610aab30f643a4904e7be1dcb46553418a8c198844ca6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "038fd866cb99776ad266836b902b42d7df966a1a262a4ebf253ff5a4f8f85ca5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 159984, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "32fc0eef8ec179ad3f8bc936311a964c24664c74364207f66451432635273113"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "f46a309a6fd1e3a2b9970f6e53ee8ac8367fe24bfff1cb98869227a740627d1e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155888, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "20291357beb19104c1330c611fc8fc768a465dab2ca8ae3d26c9714e75ed339e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "63b811e553c018ced48d425d6ea4e3f4d69e98efbaecb577bf5820fb87a1411b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "ac62b41f621e870455ddb4d02784e669879eca1bdc46d43dd0c676d8f431f1f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "4dda04786dbd33a0c8766c61411c079b484ed4bc9651c0738140d31844e74298"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "a47a73f7b85d38e57ab9801fec58f51c488de757e50a88cf0b0a695cb2c67b8a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "ae596ead3ab9dad84714b49ed33609c1afcfe3af87263f77b562e6374515402e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, "8331082faebe918fbedb59198d1a5ccbd84a5980a0f6839d58f0004ee3432c57"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, "8b01914d5ff90e8fd126179e2b33706381c52166ef218a9c598dc60c4942a2c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 159984, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "f7e54eddd0efecb037400be127ac0e3301d2beda1042a3d49f7ac9119d4e3aa4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "ce01d95e1a6a7aa2bbe3f18c8110bfbf4c597d3e6a7e8598ea6ce4ca49191fe1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155888, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "e425f3af3438718b8f32a2396a60b5f6bdb4937846561601d62c994684b553bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "004931d53f28e2d589492e47f7e6fe3bda8ada379d62bd8edcf4ffb250032d0e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "0423430ed3fce8cc636d13358e19a0c86da51b41a8c101955537f89ae2facc1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "c8b9dcd22c05dbf254e9a7f6cd22309fffdf48a25ec4f0becc13721f1bff02ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "881bdfc0eabe595f4275d208d1105304c6a48bf1ca3f067646f90433da6bf47c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "5dd9a88ec17805aa1323da7e39794466556199253bf9a18b458179c0d2aaee8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, "88ed4864951f210a4297eefce768ffa4f20432091c3aeb7b77a50a9b1450d3c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, "2466cc9d880b0f1ad7bc776723abf9cf3c2f02f869a583ab8073d3e068a3dd01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 159984, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "41c1051931f8c14c73dfa389e378f8f6e40ca6819243de9acd43e3acf16a07d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "af3a2076890791bd4516791ca0d62ffe07e387af0ab77464f2edf8dff1b6f715"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155888, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "225962b94e0ef8dbcaef23c59d19c133ae13573e71dd55f6c466b5f9a1fad43b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "5c44a5c069c72cfb340ca18891e115a779ce1d71a512909363c58ba0682e1fcc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, "952cb5e9bdab0f368a65539ffcb259d14a89aac54953372bf23e95444a57dbbd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, "fa8067a415328e6e95c43dda1c4e59fbdf6fa2a9dbb9b409290b3f5a9a9718b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, "1d6040c85ba3d850d9064f1f0c8b6b501a7066443d214b7da8738e3bdc689104"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, "4267f54b868c311bdb2d19711caf1f0ebe4a46a305cd546ac8a4713ce23a950f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, "fe8fed65d0f7cdd1edf11e35a8a623786056e4da9329e4fdcd61a4989ca5dab9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, "8f837d5a28b6e867a1b3590f7423cbafe21116db1f277dde8d81b46831faea21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "b4b888a912d7e0ceb366bbfa1c63f014d478f7b7564e20ad77acdab05b3be78a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "133e51fd09685df9b058537d2e0aa0e33aaebac6f97110698eb0cbe8655dbd57"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "852a26a3b4b30089ed4ca83830f4110eac799ec98dfad7eedda120395f589a72"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "161ac9c8df58050a14b17b7f5f9dfd01c073a9cabbc615e828b44e6f5ee7c9e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false, "123326dd4f8631fbff3c08e553f3d8f2452875f55aaa532e3aabfa99efa34c22"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, "1f1b3aa468a35f321970bd189cfccc46a250df483a8775c524e6389a4e64d32d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 184432, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, "122a2a3ccbba2ecdb99c42af69790bbd3c534e39a56671b436f49bee4be3b658"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, "ea82554bba2c687d0735092011495a246348ee705006f6e45b832cc436291a2a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false, "541125bac97163551159d5d5466909840450bb451dbac6ec87c897c2c140ebef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, "c3b60df4237ef1fa97642b16137a9e5903ba46d12288c6de1e8aecd15e7a484e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 184432, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, "cb0befe4cb3ca434bdd8282c8296a8b3bad3747f45df9842bb12e30e27fe490b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, "367c6214b5e42c7bb4b296668a225c79ea522aaad290abf29c13f2186f39f588"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "488354bbfd41dde44efccdd6eb2a06ecb0d0d808c1e80fa9942a9e430a0e9919"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "a7c0dd0e53c0a8fefe290bc692c5082f03064c656c931441bcf93b08b383ce1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "c4d2ebfba8bf391ef7299125a56fbb7e07f05db3b82c7e31ae096098bd42e91c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "441a85a4e7c5ab80e21a2993df88d4f1d5a3236a2c5783ae3a56e0828872d8b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "f9bdda10bbd9045b9571b9d11c6076e154dc9c1c3ee37bd83cfc258a48fff3a1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "f61a550d3b5d47864a6787f1ee69525180b801f8ca0a128c869e684cd0530695"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "1b8ea5e1c4460e1754ea6f93dd81f8d716b64910e1fb7352afe3493353fbc4c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "72376cc6d970d1ec9750ff9b5f0356cbe6c980166fbeff21bd654ce2efd84197"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 158448, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "1661fdd56e11f43b83f57adabbab3e4ce0d2246a8dec919e21318477e9763dac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "a50ee59b8b35a9b49cb5a82ee47f646ab142e1e6d640dbbf922db79df8d2b378"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "ec9562488841f844758967122ec24363b44f6d8f874f97cde790412d7b3a7c06"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "f18a988c9caf92c38ffbd26c9d4dcbd99eb05e0ad000161b23cfd8c9695ddfb8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "354d58e121e4981de51c95ee24056515fc27faf23575c9983f5e326d21cf8e28"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "eea13abce71a094adfae5427bdcfcffaf45721741cd5c48d985e685f879d5607"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "5ab3fb790f714c4dabd2d00b6828d6b73d6c1e34085578ca82118d3c45bcce21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "a39fd6057fcc8a409f6340bc626f4d871cf0c6ca9ca97c6fcceb639b3cc474cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "73472ffb8efcf1d5be9b6c3c25f35ba52d8d97ca1b266396c20617b50d2297a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "47a96e0714bdc08c84c3b807d1c6eea1bc21a6143075b8a181773fcbfc89aaba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 158448, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "9c3f6ad5e80c3453707480e97c0e106d5b9b6802862ac23285b1790cbc6eebad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "38f405f25cd18d4e0e69fa8770efcdc2381f0166bbb989a0074a600bf297dfd0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "bafbb32a521a3790a61fa4dcc161fff5424f7ecfde87af73493f35ef8b58fb6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "b198aa2068e1bf965f644f50f96f5ed437a872654f3f73573332538c122359d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "d1fe77eae31c81661272f64c67f906c356a927eebbe5ace37331148697e2f47f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "3b3e2cf3e711d455dfb02118862f09599fd686f9e3010ac5c966847588b34a0e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, "91f0c5297cbe315fcfe1540591e466dce475e14542c010be9ea13277b1a3d150"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, "d553b368cbe477cb1ada5f6446ef691d098ad0df8e3cd3dcaa49fa8a4bf35099"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "9438e636b5561b106b8d8dacb8018f72f1a782e55f3c92ba14423c4b4378444d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "9d6470dd6c306ec43c97e0c95cdce65c2d65b7c3c4d1509b64f9ddeeeedc9754"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 158448, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "9df3c41c3f0f7c84331ba2d2bfa265104c76ceba0c74e5e79e5b33af04ec548a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "033601f7e51688d90c0a55925a33602504ccd205a771789b23aae8c36baee688"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "053c963b3829b0c7903972fc71167c7ad6c79d54e37f5f1d7e8dd59b2f7799ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "fedcea5ffb4dd90cabacc506752fc3f0f6bc2944d5acf2e34433683b15ef47cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "b07b4e15e857a7909f602560045b882d33bbb18c11026a493281aabcf77816d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "e3335ea15c1f52f7a3335eeddbe60e66d7f19fe5d89d3b9b3c4f11ad1d077917"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, "0a7996034ea4ee5e7dd7098ee62a433ccac7f765567e93130f1fd70c0a270223"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, "18d33d45d82c64bf376a39f9146671375112aff3ade588321d3dc16ace1e1fcc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "74fa638eb2414b1d086c746c58f30bedae3cbcc3ab571da82441493eaf8eb7a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 158864, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "6e5fe82d89f937e9df986bd99d5369150f798f73cd5ab80a7110af03eea1e47d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 158448, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "a00911e5953069ef86bf6ebc72ac57066d27384c34743881619ebd63cb739c8a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155280, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "90f4f1d9356c04c4dc01b8869ce08fc9dccc835b4e7849b2a8393d85587a1910"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 216288, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, "1efb820b38c74bc7acff0481cb27bdc908f45705376176400343faa2afe42bea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, "ee58cba133ec6c4b188dcef3fa207a610721efc3f880b204963bacddbd5a5fe2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 216288, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, "f36b619916b290370d8423f3342dcd50c687c64629b55a671adc1235271f9d6e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, "7f72697b3668553cd74c7924c6c69cec624d39ca519717c3d95593e84c1c5447"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 216288, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, "b3d5d8731a728be419243756ca82b763414e33ec98d258327268e54cd9ccff45"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 216192, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, "6f9233d892d8643ec6666e9cf8c5951f0c327d0ddf812408a2773ef38f7abaf3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, "96fdc2522b03dbe0a8f1d4abbb8c3ba1206717b230c13775977ee8f96032a460"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, "025a559eb1e1220c26ff7a61d9b8fb9678e8c7eb467f0daa3d8678b9c08d7d82"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, "f063176f9a0c6bc91db5173708ab6b8b008cfa2db33119b90d840bd1be485b8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, "4ccc498a848b5116c3109c9e62b3716975f37f7968a9d47b567c235a090c0c42"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "8de3ba644d1867740a1d84809a1581c47031336663564536331d4681c3640a4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "1f43a274c781a15f19d57e4d2c269c4e38c3d3e41c512576a455d13de7bfa051"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "de1273f7b84373095200603b4d06f1b6107de5ba7556d9976e8aba8c52ad5ba8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "3c83f917538e947b1ca6150087df50fb49157742bbfecacc58ed8c6361364071"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, "045ed1c98a901f49ff753c1e28324f789cad6dcff45fa843a84ceca21bf0c75b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, "deb19da1b774fb13453e79f1cacabcf2d6e3733256eb2a66a02aa424249c5f29"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "bcbb8c27bb998f534ba8ec0875f3f9b644f4e2f7cd5b2711a3dc4a143c9f2200"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "966dc4e5225c5db1888f93ee05e3bafb9c0550d294ac9afe412c226ea46b6e78"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "c56efbfd1786d6f62e92e660bd2ea91abe04a1670606c93f1eb8d1ccdd1fc2d7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "ca8e1e4a9203cab09c7208ece09389585a55876f80c8fdd03e6d03f674231c22"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "2b3bfb7f16b71509d3ffec7f86a910e49f1edb179c3ad187b9a5ccc0b8d28d6a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "deaa491bf414b33510ca34fa2563ac30d5e42169e16657290f2ecd5d43382423"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "baab22647d2861c25aef1803973aebfb100d8aab9b54b7fc6a13820947d26185"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "a223b0cf9bec9b8f83e89aa6bfe52b88a05488f440db05ccb4a0dbee2f055dc6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, "90ae18c53492146c7b5263715cd60ff7d88b3feb6b80f35788c0d15655487f71"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, "4b86fb80989b2b134922aea6279b6eea1c88dfd11304c77cc89319a6d01aa5dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "cd5c23f2a85aaa721ad093522cda5bebb38ea33a76e5c8391d63a6d045bc33af"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "6acdc827675eb64960dcd6360aac9a02ee8889c3a45267b0b8e7395a38c5009e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "aa6f2bf03a32288250c7b3816c4802cd2868b6a2f7161f13e0f2b628dae02cc6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "7535d559e0e01fbbd6a905ead98e0fe88ab57d0e77d9b1e702d753c37701ef99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "5f5ce7ab7f8da85cbe076f1fd19b5c7df4a4df12fbc051b994cdc4d566645642"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "bf2a87be2619e52a8cf4fafd0daf1f3b8007dab56b6de0506f88c9215d0353f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "efbc15cfa3dac4109cda1141c4857e540c082a30b62cee10341785fc321a1d86"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "8e1cacbef4868e83003df0250a770add7fcef004c1284597c37057b6a2c0a918"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, "4d53efe846a551d13f4b25189c13405f4d7476212db1b180e40f0598d38dcc82"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 217088, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, "1d0c5db785330400ed6f613b3ad1ecdfe873b214923002fc2c3eae56a41f8b31"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "6a53606add75e32fa2ea42a6a1b9ccfe1a97257419e0f6e56c465f671e628fbf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "a2149b3f9c44861f53c8684582e8316a48af3b43b7bdea772da64e16ece25419"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "e408e4aeb1fbce25dcfca13030c626bc48e6b4cb8102ec6e14a40755f8b26262"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "2334284b8d102023872891199ffab6da2295538832c4ec3350379c808224f3b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 194256, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "9e92eade0baa5159a84d760e6d20b8ff2da0a0d41250d1d639a1710689d544d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189648, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "02652d2e5035a61afe83071747eb3816fd278f83f14e18742bed7cd412078a61"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "576087f4dcc0e294c82ebc4d4b238ae4293288dde8a12da54fd8c08fa96e743d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "394831d100dbcde759f29898a3ef9a3a1baf0bebabbce026a9077f6de02952f6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 217184, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, "09f74b809dd79c3cf57ff5553492ab19b7c977e6299e27dddd37c40498d02ad5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 217088, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, "782d642592cc37b30dc1da3e07586ed9716b59d8f7139ddcb3c292ebc9dc3375"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "77c9060906819855a2a6b291d53edc7f20b5e1000473df44d8ff28802d378690"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 160912, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "b6d3b81fc2049f1520a0e9c04120ca46b367770b47ad2b8968eafe8523393d1a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 160496, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "6ce56ccdbdc9f27a5512a3bbb48e49d408a5e9bbb7757f801403aaaa03585155"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 156304, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "52c224dd7e401c9343ea28842157519fc5ccc21c5d26a2fa5bb7a12ca226b44d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 44272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, "ac77a4c4f0a61ab6903555c999caf72d33632c379fcf5d97b8c863ac29c1dabd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, "b36ec76b8181511ca2d688283f43f423d5554131875ff1621bd8e4f6c3c64069"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 44272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, "62b8239cd32dd54afe846756fe0863cb0e0b9ac3c5514f1f194a3f974ef4a204"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, "14263c8345d8547d9324710b1de11ea2842481d645a75eaa23d729c439e56fb6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 44272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, "65aa246c2c4987820c878d73dc6918ac688c5b130d8b21ce530f07fd13a5a7df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 44176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, "78167e3c49edfd9bb4956517506f072bdd0b17d3c47a1105f2b7f17710014137"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "e6e175028b13ef72ac1467064d83eb34ead834a910d42ad69c85d2bb041635f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "6b0e8b8781aed7c267d37973e45b250047a7fcbe9dad6a8488bb8eb5c0ebb2dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "a41b33ee9e7afa64db892f504ad9972bb8bc7d5d7b393f8911e943d81d47f8f6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "07f83f707c8060de7b0dcf8e0c22771e4bb9cd89e8bea18eb61024354168e214"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 194640, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false, "9873d5eb5b6a4a8928cea6274e0a2dc799be7a0c1a508cc19bf2d25c11205793"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 159776, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, "3412bd842ad13185b3c4567dab07595fc6a5b20a164f84cda6b6b91a2facd352"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 168048, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, "c96f0019cec5201014355b5ea50ee423b23af455792845403d0e157de78fc97e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 159760, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, "72108951acee9e0ba9aaebf82608d5852e7e0b9411da5245d9f1303ea1fa3028"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 194640, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false, "448800084e2899c97a0432d11d0169591f8a3b8d5d84297cc5fa3c5edcdbba6b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 159776, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, "cfe7af791b383f9ab0cc44db6bcdc298fc813479429fbe52d257bcd9d63ee31f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 168048, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, "2d8009bc8881bcee7b3e97cf7a05afe417d727b93f85ff6d5545720da4d7639b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 159760, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, "cc24f152b4ef75d4c6d7d8acfe4f15a2b802564c6db0156897a8d9518b256637"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "7d43be870916ea445081fe5b9c8befa9e2ce68c25b4f8fe623748f83e2117c34"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "6ff592f44e49f4bfc34dd79ac4030cc3e7763f606a2ddf7fcd59e76aa191cd24"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "72dadc7d8c237782723a9ab23e7a3c3a715a02a01c0737f2f3a1c04d2f3d1b37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "6ef0a45e49ff13f7753a8232ca395bae164ff1b6a5ae68149ee0bfec40cbc6fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "a5aa79dcd95089cd10a21f0097532c8c066ff749fdef8471d15d54c59109af30"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "6c7b89d6ed454d6257db7d4ea7483377905368ecc986ca0fd5e49cb80fa4e5fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161008, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "d1575aacc3503437a184d8de3fc48783bef8636f29ff1282cc9f3a28278ed6b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "c2145d16e3ad358e29bfa11874c36c76a015d117ec0e349706c569a41921d7d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "84ec2022bccd7a20db23015617bbb77671b187c890619be931d74b8bef1e8a3e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "84a0502d90466612a67d01981b4f817f6b322adaaadade323b2fabba9d73132f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "adf8bc7d1f0466c84fc8e0ecfce3bd6716b20e9c612f18c3680061a8b8294514"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "415eee723c9c08df396c44ef35cd4a49c7196533a021be772e651b08f7e902d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "c8ad0cea69431417f4d81a8cfadc0d7a2caf42b6253dad13f55db6ab55e9997a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "b8bc2d16ad8cf0a60ddce4d5c73c0e4e1b03a13114d3cf1c5b6a2ae2d8e8022c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "bcbb701a5fb84fec0207f6a09c0b89487d279ec7aa846055c21acefa6b9cc679"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "afc3c4af2dd1b93eea3da09276e7e79d05262871ccac48ad8c9ccc4c4bfad8a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161008, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "1c7d914b043f5b0be181a596812fe62c69e57a352efa49cae495ff6a15e333a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "7a69a58a61604356cecef05529a3655843ca5d9a430cf08a460fcfd9fd02b6c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "c720ededdf7a6017c7f996d24d3fb15c1ba151ef130bf23fe147840ad4dabdf4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "4168f33ab9a09d9cd9950ffc8fc0aebad0d9a298100bd91bfb50b72ad1f67719"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "9a312f24cbc93a284eec5d7bfd7b827f74c502fff72587037a8747844194538b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "4001b87a71ca7355c45c9900c3a3d6c6ab673c291ddb3cff9aac715cf5a432c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "076bdf684054d63fa4ddcb74ebf4c015b23be15b76e02d94893d7f18671186d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "44fd90caa7d156b77579c3b7abbba2d8e2184919e3388ca4a2477b2021fdd868"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, "747df5971cdd9a10207f17bab5e1ba5f0b298a9bdfd5f38e91c031d309b40384"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 45072, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, "8c57599d4bbf2cfb3aaeca9e1f37e49886df58d31e2f6754ad53253a7131616f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161008, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "dd3570e0c01dd00f312eb5389effb90ebaa2777ed06e2cb9aa2680086050a5c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "18a5728ed0af19c6542232ba094f64f7d6f6ef0f899d318e943a9232600c2d3d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "dd94a638bacf5c66d98453bb7912ab2fb902b353982e6de040126e8cb6619cea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "bf2bfe1a77369fb5ce117b0a32131133c9851338eaf47b436bd949fb64fd2c04"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 192720, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "7d4620d24bfafb4deaa103bbbc9c253f1712c17edd715ff671a4134147df04ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189136, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "51dd8ce78939a931c0080036c18fc30b424bb627df8c20175b1577a0693436b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "cc63ae47079ffed23efe5dbdb0b6e498a463896ba164ef56c374691e5967e4c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "47974d15d9eb7aa992a0b783f383deb473bf9dfb8b265fe613cff2f733d73d90"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 45168, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, "6263752072d872931174064f36e627794971afc08502564d2dd37a2a533e4535"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 45072, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, "236ff27b53e3155d6b5e131e0b227e52942018ef91c271a663bf0b0f1a848081"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161008, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "72ee796541a6f7f7990ae9d5578bddb0a370c7c1ad360cae9bd3b0599ce1e807"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157840, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "5560b14ab280d42fdf9ce31e0abe8b89883e6f857ac9a2921a976c01dda5613a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "8bd7a84950706a2650e669f413c2a31c4797165d9b3d40cb7cf0e794289704b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 154256, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "6031da57dc045fcc54363898819aeb5ee9c8de8442d4a9fadd725baf8bcc3203"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 167184, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, "021676739d84a482f01892b19d70a109ca0cd7dab29937fff8216307c5edba3c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 167088, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, "01025a260ac27d20f88413e57f4f98e8cb65c788896ef1faace22349628ab007"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 167184, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, "7d44008caf7eaebdba0209c63c9e80d4747518cca1758c1df1af240423f0a03e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 167088, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, "548d4f4cf118c262b2204189a6caa1b745becd7d9d9d4cb45d141135c9bca209"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 167184, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, "7be50849ad8230bfda111801a37c7ef1701772d7e2e02bce1cad33aa54c98411"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 167088, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, "7dc43b2bf52c417d4e9d0bca9f2c1517bb49047e65dfda7af6ef242c7ce08b64"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 168080, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "7810f408e057c23b7f6834b341009c1e182a85494c4e3a845305340e26d14797"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 167984, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "5a82326cbaf9db59a22ba72945a3f4c62d372626c03ac09c78162ced04e38ee7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 168080, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "c251ce0f1abf9ba281ce7860a1822b59a16a5e95628e984870c14bbe4ac36d21"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 167984, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "17a3cb8d12c2e7ef2d8cf1e4a994a85a6051b9ffc336c5fb3a4e9e75271b457c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false, "deb43ebb4b424a15e770001c121e35cba9e141d90409012a23a35555a94777c3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, "de3b4253a60b93044c9b22b216258ff3eb56313ed9f112e3e3d8a1943a015385"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 200816, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, "b57ab3cd0e29af8b01fdb48cd1e8b2a9969b46781bf9fd7516e3c3453f95fa88"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, "8b784bd6a0fb55e5b62527380da4e6345e15eadde7f76285d7f2688b1727a5c0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 231376, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false, "3b89641f8d6169dd658a5502abe3a63b75d1687cc3b541990b134ce43edbf7f0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, "ef89ba7faf419c4223adca287ab4638fead19fa2b6851a37e9c5b94c1f05399f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 200816, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, "75f09b179bc211959fc8547c37f9bf8bc83d124c042f7416ec7e931ce2df89d6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, "41ecdc3cea8d7241daa79bf3d34a4d59288c90076a07ce8dcdb09d4318ce2b99"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "dde520e5123fbbe06e2984aacd722a0b48c53d6666de44db0a5cb915d358b0e4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "b52449edd3b433c30e2ecc45506920e0ff9c44e24c9c4850e0dee317f0dd7799"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "225e2f97be079b60e912a9345d33833f5c0a9bd5be0fbceb94c0284959aab009"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "01d19ec6d8395515689f8428fbace2e6a1ff885ecfb4aedd9b7f9187032abe79"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 168080, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "0b477746fdde39c75dfcb6f6053ab064c1d4ed943b89b707bb428ecbbd4941e9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 167984, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "9de628c5d7c4f1ef8a3d81cab16b1e7e2fa15b67eee06b9afef6f1318101e37c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 156912, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "ac7058f4f3faa9e2df7831aa1e65e2c3105be29dc97f91a7cd7541c985b40111"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "3946f4e454b369030b1e8bee4dd5b8e9b6509d2132a9be1970cdc0772ccb330d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 146160, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "80da0e29721fb3329517466aac6e7580638bc1c68e2d3f1aa8f2c3ed5e798602"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "c90e2b5f32e65e0595b116dca593ae1f5a471ff2e62493f6a267dd8a3745b8ec"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "d3a68ca48d58970fb543be4ada0d75b45edb8c9dbfde111c13eb429d875223e0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "f8b20987103675bb2066d233829c481ad8e8e7b062c7b1b477f2266e55b43a4a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "d19bd85c1550a33729c1f5a235458de867bac89a56adb7b74e7550fa7ecb26e4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "128d820d14909ff11c3405606ef37e1351e30bc4b69849d62bcd7219470eebe3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 168080, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "80262e63857b2db9f070ff0a6cfdb9c2c56cc883ce462e12f214933ef7f479f7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 167984, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "c0208d0f60076f11c818c296d4ed870fe2e9bbd0c55a48542eba2e2758b9e125"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 156912, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "ef3be80606bf4ece69b4e619a668f52af753fd14347b392d7732ae70ba17a497"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "52404ae2bb96d30183b12197758e6c486f56d57aed3567368d4f70c4bcc76ce3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 146160, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "5158f084c5d71d0d2ad14b70841d39a87c8d1c813433fa1eca86fcfbd5d4b51e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "84ae5085ed0e46fc5bc03ffd62dbf95fa0073a27c581c0d0de1f8ba3f3ce079d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "c08601fb7a7e9681db980628527d9a0091ea328d19dff6def57a3284a43f5c9c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "5f924011cd292f89a9d816c4a3a2a1e79fb099875dceeebf34770916f91ce7b0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "4f1b34738e113870cfb598ed3b00f7b2d02264959fad8834bf113ceb7201bd19"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "65f64b5ff73b0c0897a034027919a55c09ad05c8b9d7bd2cbcf27ea8dfd33616"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 168080, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, "602ce13ff91a19baac129f46ac279311dbb8718e1d25c11e21d3f6ffe9229435"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 167984, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, "4479549d684161065e3f26ef218fcc255cb4d817d6d0fe08e920b2f06c9101af"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 156912, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "f54f21d31c29879d17d163f1c4a9ce72600d6fc3bf817090e81554f58f0bbc79"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "5b79692c4dc78b03e9a23d0f9c5d34295a4933ee3498812075dafbd000f75dca"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 146160, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "5352082277f8dccc34b98d9f47821198bd290804b389aee55efcd8e5c31ba143"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "1fa24a4484be31f5b7b58ac74f4f01f9de16f174dcab6471ab2c2d787ef65fb2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 182480, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "474901e52d89133cf38cbaebb9250307828df8470014df13fd49e08cbf89254a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 175824, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "164efa051330a75da4d8787c4ef18a5e370074e7fb035d5b5c866667c9e984a1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "c02a04aed1aafb062bf42f2ca2a09c6521f527eb173a2b62d33532e2c4bb55c4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "a6d1d166203299a7d5f83c5fe7036aa2b7091f8cdaf0e5234ab952e2720791b7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 168080, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, "a56192d13be06690d924cc87b7d0a182c800359d872c13a51fa0f7736a2ed3cd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 167984, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, "a1076e4331a1bc2c8107c3011d6a0b8f6c23af68c5ff964167d61570fdb40f19"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 156912, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "b3505111aea405d33c0cc983747371efe81a15a0d27a9b92a5c9b29c240958eb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 148624, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "0589ada29a0b8735c79f9b657c61e3b663c87b007c80571f4cf2773e9fa36fbd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 146160, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "b5bd80092ae0395d46a473e82eeb097b0a60b1b6501b43c2ef30951ad120f3ac"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 141968, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "5d33aeeef43794e49653c623a9bba570f533cb27608304989acb1aaad21e6e9c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 199904, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, "f12912fb85dee190bb777451285cba714c87b0d36eb4a4d38f489039311b091a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 199808, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, "76f023aa61b8b84a7488e01b9ccf149a32fbd5ed578596366d5cc7d84cc63eb3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 199904, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, "fea6c90405a48d88161de9965f41e31f357b636646ad409e0065fbf21a7b3ff1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 199808, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, "51b3d22bf59d9006dc040f3ab02f799e365f2357582f6e98d0efbb7373d12d5a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 199904, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, "f8f68f0f47d44ab270e4d2fb0d73385f1598da4d0ab3ddc0f769b48816c13b83"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 199808, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, "6165c34ecb08c492e8c3f093ed80764ad99a41e82cf17c8751bc32d985f2f4a5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 200800, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, "d8a491d4d6f9c6442ee7e62a43f1698636d2b92863dbb95b6ad636540eca0919"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 200704, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, "d597ca7226f94c9367e523e9324f4be4c8edb3f8501e7e82f66877498b24946d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 200800, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, "149ed16c8b801bff19636c4d2ccb89d7200af15d0f56cccab7248573b349c98f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 200704, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, "92db572ba2724b992cd345ee95cf02360dd009ef1c0a09654334f4435ac35b8a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "00b7076d158ad836c1e3a9e078c49cebb2a8449a60c1a14c90ebd3d21b228858"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "baa808f85d49e53fae419ee23d209d361c658838390239a80a573123e0b3792b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "dd580948c6dc722396a832abe88e5633b0390e6a7ac39635a8db9ffa84bc8b25"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "dabaa640a52da2cb50753a6895742b448242fc14ac578fe53595bc60a42b4602"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 200800, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, "97709535d6e9629cc4a47f5cdbaeb4f0aeddb7bcc684d98c48eb6b1b1de406c8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 200704, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, "3c8ad731c2930b9c1205023a1c4adedbaea78323f70e41bab597952c438fccb0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "2f1c0ff9270f738e6ea514a6c7bbb7042273f3a862ac9a61262f11faf07b3b7b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "04b9d80353336ded6c25888332d8ee3b462b5a1cb782289e8ae3413b96bb88d6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 150256, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "7e37dc78e4bcd98b4782a7ae1e42f68c140e5214d06d331bdd12adc14136d259"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "52396492331f7dc37766f004957f0d1b50ee02c7c7f11362647058a53d9bd2a3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "89f8c37756f33823296b6a586a379de6c446d394fcdf5630cb0867ae7a7a1d44"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "c693820b5172ee849b5cd794950800d75a1e2eaeb2b8b0f53e9a0b0d368bfa33"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "78c4d88696ff5692d76e0da4ad3d89af2935ea6a72529b72f1ad6a27f003b921"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "c04c573ba4d624df2e2327d8e8a9e6cd808588f0e3eb100b55b4cfc62944b958"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 200800, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, "d9cf9700836673e7d07228a9af45b4e4f7db13e9b3866174acaf7042ee7f0f21"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 200704, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, "1b19320042fd987bbca80cd5c4fa2687a163575efda88912b5f871014bb94bcf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "f2f564c73aae6d840e4f5712ebc8d1c7e42c6c555feba0a7b9cba20135d9b651"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "3af0a910e37a639a9237ef43e4e7d966c34d3ab0e5d329efd4771d54633358d8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 150256, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "990ba91a2e4c5be73f20674238dc012b8a2f9cab64ee8591f29c5291590d4fef"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "70834c87a3487d660248e09b735352cdd5b9f9bae36ae9307b3a996572151dd0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "4f62bb7d40a537a81778d6e7adee714f8158a7fbc59cb214ee3301a5febaf0cd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "7cc573b6cdfc82e5655d9ec107287830b4f76bb4914805b3eb54b148c41b844a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "45c4a87b9559303debf68f279d0365f26960a42c5eed73ccf6f397cfa8b25a78"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "acb02540b7e0c023971fa5744d3f7b53a9e6ba80a8cd9d618eef3fb5c9381d14"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 200800, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, "e23390f3f9f81472d3ccfcdd9c8078b64599c070b51f1ec69a607d5502d2ec80"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 200704, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, "95fcead4b041a3763aecaa4a1b9b9369a012c769a5c080e92c75f54eaeaa12c5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "55e0cc1580eceead67fced7b93b60f9f281d7ab3459659c91e551f5d7258b184"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "fbbda6d310042bd773b35520472722f4804908302065834c04e9d3f08f8ccba7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 150256, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "a5270eb88d624d6fa1b722224c0e81a1e18bbd7c9990c78ed85f6eeb4e6bec97"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "73d3bb56779eda44573cf7a2758f29469fe092a8c9ce3b04ccb4c73a55f3721e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 186064, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "4542766cf60687c0c826d27737dc24d3c007a2dd525638d4d0dafdcc777c2e24"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 177360, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "a0b53672d8277c1b9709980034c4c3a013f63c7d69fc532cd026a7bf304a1950"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "fa9bbe95bc77371852535eb8ee78a7fc5abe67ab40dbcb3e166c3d94909a8f81"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "796e989b2b809f349af10754db3798cfd857440b561853e15c35a868bd84faa4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 200800, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, "f32777ed83546e889027de935097ce7c95c9cb15f6ba4a557fa3a46f1229c97c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 200704, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, "a3b101fd99585339e76c5607bdb5d1bec87f8a8368092c634eee21cc75310f7f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 165104, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "c4b645aca6c9dc094507aae019dc79cc65456dfa2fec0321a591479785559a26"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 152720, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "d485d9bde05db74fda7783fb123a9038887adb024a7810926f8ad2eaf687fbd1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 150256, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "208745c8a6bee6589779e21e3911d5cdc15f7340c0e5365e22eeb609694996dc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144016, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "143f62edda3270ad3b4599e191199524ff3a7f764f0e2576667d87b6f8a5cf59"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, "de6463a25695c4a2569d77dac62aaf52f924b6f8ee816cb58cd5596ac70527f6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, "bf6bd695692c90dcea151ce386296ee47daa7113eab1dfd246201d2d0a35b927"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, "69962b88d79f8170546b1d7831aadd58012d89134948a3dba452fa8001bd62bf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, "ed5f9976508f9485a7dda0d52cd3f684895ef1f6377eb9dc1ac220bf5d2b7ba8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 85232, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, "cb3869a4c82018ca768a44d46499c7f27a670c25a0a77fbed09aa6d67e2af277"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 85136, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, "30e30e51d698e49c77a5d3ec06148531d8084b8228ae8d9ee61b74466e01e86a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, "30b3cbc8bfdec073ed3f87a21e4dc56f9a37df4eb68a1fa42171d1b1738d031a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, "fd3baacd52b96b76cdb9aa6ad017dfcd4f89f6d1ab1f55e4bd04b43bb3cafeb4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, "601e2a82b0476aeee1d6224f9bfeba8eefb8208496714fbfc2bc4e74ffff0e1e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, "4f72a73ac3e89659a0a8419f67c276c8006521ee32d270b024b1d36dd7c79cf2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 202832, 512, 2, 32, 3, 3, 128, 0, 2, true, false, false, "8c74319852001b82c9af302f59be4824ad3d2fe4ede88be4ebd1ba38df8c4826"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, "b2b1020c203d3d7b1fee5fd067df9ec15a38a67529e5686ba7f7387fea880796"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 184432, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, "861f5c102a8de6eb8684dc049721c939ccbbe0dfe0df923b5b5948ecc17bfab6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, "9ec0bfac97d290f3b123f91cd70bb4823f08aa0bc6f8daa2b192872950e06c2e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 202832, 512, 2, 64, 3, 3, 128, 0, 2, true, false, false, "8d7db2c2f7567bb8731f4db28e6291d0ba74aba4e10a90339da96ae8f280b398"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 167968, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, "3f9a9b2971e7cf55bc756a68230a56cf2dbc4dddb570e5f6f6fa981cd7d0a4a1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 184432, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, "6e998bd3d1491c5cb83e10aee09d79e5bf1ea13b7d8255f492679cc5c16505ec"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 167952, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, "2f29d0fbd755fae47fe3dc28926023a17a9aa02942f51f4f79b7f57ecc1baca7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197840, 512, 2, 32, 0, 2, 16, 0, 2, true, false, false, "4d0fe5850815dc59949ed998c351437442d331287c4c52c98e29c6cc7e6942bf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192208, 512, 2, 32, 0, 2, 8, 0, 2, true, false, false, "14f80157966ab4834c102ac7145b1f32da0c75d4d6c38f5b3ed34b224acb5b5f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, "d95cc5f401080ef008ceb6beea78eef6bed4d3e309b8d91810829aeb496ff34e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, "b4d919727108891adab01e7cd63cceb396eb269b2843a062b81c87e3fdbdff0d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, "0985ac695a311a4c7ec0e67d81ae613b8852b0557c605d1b11f9ba0ba3335edd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, "47f28b3c5cdf31db59501cdf9dfe5ebcdafcbba8d06a3cb129968f8e8d28ec97"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 167152, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, "e5f0ef140a7c32644896f5f4206f0a54fe2bafb97d904a38484136c3562f22fd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, "ce281df71c91a45f9b79932f78048a764e33848add4066c7b669bde61eefb0e5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 159472, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, "97f574cef0cb93e05c9663f639d653f9ec1e94c176ef706209aa961fa67709bf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, "e79a58e152e337631acabac206e0cbe5af8bb2e20fd65a5a4a48649a49bffce5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197840, 512, 2, 64, 0, 2, 16, 0, 2, true, false, false, "6e9030cbcfef8dc8e7501e521fba3dfe5e65bd801e03c456365a26bda9211cf7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192208, 512, 2, 64, 0, 2, 8, 0, 2, true, false, false, "c7e224351b0acc8cb8a6a46a5a4b4efdab7f74316e6e1a67ef6bbb410e8d1907"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, "790d68066962d4d8def44477958cfd93978bc4d31ef5d483321b4f1d1fe4514f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, "63b068cad2ab3efa346761df89bb253c814fe44811bb6780bd7aa510345247cc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, "9e7f6b2cc59d1d06178eb09be19cdeb1f12cb4ee7c54cd95bd2862e373432e49"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, "1013f644f043544cd63b9648922ccfe25e322ad4712e6ce613939404a1691cbb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 167152, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, "bec12d24b02f4e9b68865a7729c1d9ae660dbc51ddfa40a6ac7fb4c2fe8e246f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, "f867899afe3d6aaee93df1c94ddf2c6adb40bae37af96088cee95da2b8731279"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 159472, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, "062fd569594e4700c83c200d6ed4fb7727e31583a9434215d449934388603280"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, "fea509045b1468626f56334aa8c5a15051beebe551b5d5d1f6f333f4d1df71e3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197840, 512, 2, 32, 2, 2, 16, 0, 2, true, false, false, "00387d7c0344035b1e012ba49c0c5d52f83f34576a47bfda1746badc49d2bec4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192208, 512, 2, 32, 2, 2, 8, 0, 2, true, false, false, "cab98d63b1f35e34d106f668723292e00a2da1bc3c9435ef069631b40f3004a4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, "8860094bd4616abbdaa5c3a7ecbd4b6bf12fec24d1d1389f03c1e5cb0d422b85"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, "fcc14397a1492a42df4e48983e42914b437e3cbc9217289a011696a899130bac"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, "27283e83ec20bfcd52a6f74dc79058b49124e50ef6f95c9f79cce3976fa7b8a6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 86032, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, "1e3cd33694e584c7e7bcea95a64279fd46336cd8962f5b4bb8d4c42b74f3df9a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 167152, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, "a6960450fc0631a3292f4e9132237f48bff1fe12a3e89f482e2a7d03bb0aef5b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, "193155880e9d8670b75dd16267c9315b055ce62ee72f5360f2c5a14012af497e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 159472, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, "c56e116afd029f57084ba96803a33d73da283ae40ebcd87e579e0e338be0d7ce"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, "0f375b65d10d9e6c5f67cbabb955f4e459f7c90000e4e1d558765bea04da0a79"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197840, 512, 2, 64, 2, 2, 16, 0, 2, true, false, false, "1b3c6cebca25b835626427412510e3b22b8d3aebd531cd3b497a4a257c93405b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 192208, 512, 2, 64, 2, 2, 8, 0, 2, true, false, false, "0451dc9ce01cdf792ed667ca3f35978b62624939240a3a8efd8931cf452f3312"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, "7ab9f64171b530abb860643cd584728b9f1d98b946debbef6aae3a4d429749b7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, "3ac183ddaa5f361e8f40f14a3eab27fb2ecaffbb4f230fa88cc074d9ee3567d5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 86128, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, "9c6e3282a8020a687a406713c935cc3d1aab92915d19901cd408ea5e443a5387"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 86032, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, "6e721241e3894c57061020ffccd47ef1439c5259076956455191e82ddc87fa54"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 167152, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, "52e14c5cce5f9a20bda258d3caf0f7c2612a7f8cd482d6fedcce3573d0ac4ff5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 162960, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, "e2ad5d5a03dafdf2f68fcca850d5d1ea418389ad20a6962214ce434b61514430"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 159472, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, "51c3ddf09918d0d08471193b470be03463e36d9ea2356b25f9b5cfa1987536a8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100Kernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 157328, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, "1d757f501d1e1110cf23932644f59c35dbb6fbe18a5f7d234d7d53fefa7c7365"}, +#endif // EXCLUDE_SM_100 }; // clang-format on } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h index 49bc0fd2ef..2e9964b8b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h @@ -102,9 +102,9 @@ public: int headDimPerCtaV, int headDimQk, int headDimV, int tileSizeKv, int numTokensPerPage, int maxNumHeadsQPerKvInCta, bool reuseSmemKForV, bool uses2CtaMma) const { - TLLM_CHECK_WITH_INFO((headDimPerCtaV >= 32) && (headDimQk >= 32) && (headDimV >= 32) && (headDimPerCtaV <= 2048) - && (headDimQk <= 2048) && (headDimV <= 2048) && (numTokensPerPage <= 128), - "Expect (32 <= headDim <= 2048) && (numTokensPerPage <= 128), got headDimPerCtaV=%d, headDimQk=%d, " + TLLM_CHECK_WITH_INFO((headDimPerCtaV >= 32) && (headDimQk >= 32) && (headDimV >= 32) && (headDimPerCtaV <= 1024) + && (headDimQk <= 1024) && (headDimV <= 1024) && (numTokensPerPage <= 128), + "Expect (32 <= headDim <= 1024) && (numTokensPerPage <= 128), got headDimPerCtaV=%d, headDimQk=%d, " "headDimV=%d, numTokensPerPage=%d", headDimPerCtaV, headDimQk, headDimV, numTokensPerPage); TLLM_CHECK_WITH_INFO(maxNumHeadsQPerKvInCta <= 128, "The maxNumHeadsQPerKvInCta <= 128 is required."); @@ -115,19 +115,19 @@ public: // Bit 8 - 11: kernelType. // Bit 12 - 15: tileScheduler. // Bit 16 - 17: multiCtasKvMode. - // Bit 18 - 24: (headDimPerCtaV >> 5). - // Bit 25 - 31: (headDimQk >> 5). - // Bit 32 - 38: (headDimV >> 5). - // Bit 39 - 40: (tileSizeKv >> 6). - // Bit 41 - 48: numTokensPerPage. + // Bit 18 - 25: (headDimPerCtaV >> 3). + // Bit 26 - 33: (headDimQk >> 3). + // Bit 34 - 41: (headDimV >> 3). + // Bit 42 - 43: (tileSizeKv >> 6). + // Bit 44 - 48: (numTokensPerPage >> 3). // Bit 49 - 56: maxNumHeadsQPerKvInCta. // Bit 57 - 57: reuseSmemKForV. // Bit 58 - 58: uses2CtaMma. return (static_cast(qkvLayout) << 0) | (static_cast(maskType) << 4) | (static_cast(kernelType) << 8) | (static_cast(scheduler) << 12) - | (static_cast(multiCtasKvMode) << 16) | (static_cast(headDimPerCtaV >> 5) << 18) - | (static_cast(headDimQk >> 5) << 25) | (static_cast(headDimV >> 5) << 32) - | (static_cast(tileSizeKv >> 6) << 39) | (static_cast(numTokensPerPage) << 41) + | (static_cast(multiCtasKvMode) << 16) | (static_cast(headDimPerCtaV >> 3) << 18) + | (static_cast(headDimQk >> 3) << 26) | (static_cast(headDimV >> 3) << 34) + | (static_cast(tileSizeKv >> 6) << 42) | (static_cast(numTokensPerPage >> 3) << 44) | (static_cast(maxNumHeadsQPerKvInCta) << 49) | (static_cast(reuseSmemKForV) << 57) | (static_cast(uses2CtaMma) << 58); } @@ -142,6 +142,17 @@ public: std::pair checkIfKernelExist(RunnerParams const& params) const { + // Some conditions to check if the kernel is supported. + // This is meant to avoid occupying unnecessary hashId bits. + if (params.mHeadDimQk % 8 != 0 || params.mHeadDimV % 8 != 0) + { + return std::make_pair(false, "HeadDimQk and HeadDimV must be divisible by 8"); + } + if (params.mNumTokensPerPage % 8 != 0) + { + return std::make_pair(false, "NumTokensPerPage must be divisible by 8"); + } + // The selectKernelParams that might be updated. SelectKernelParams selectKernelParams{params}; auto [hashId, info] = hashFromRunnerParams(params, selectKernelParams); @@ -322,7 +333,7 @@ private: { // Consider that the first tileKv might contain tokensKv that is out of the attention window. maxAttentionWindow - = std::min(params.mMaxSeqLenKv, params.mAttentionWindowSize + kernelMeta.mStepKv - 1); + = std::min(params.mMaxSeqLenKv, params.mAttentionWindowSize + kernelMeta.mTileSizeKv - 1); } else { @@ -607,12 +618,12 @@ inline TllmGenFmhaKernel const* getTllmFmhaKernels( Data_type dtypeQ, Data_type dtypeKv, Data_type dtypeOut, unsigned int sm) { -#ifndef EXCLUDE_SM_100f +#ifndef EXCLUDE_SM_100 return TllmFmhaKernelFactory::Get().getKernels(sTllmGenFmhaKernelMetaInfos, sizeof(sTllmGenFmhaKernelMetaInfos) / sizeof(sTllmGenFmhaKernelMetaInfos[0]), dtypeQ, dtypeKv, dtypeOut, sm); #else return nullptr; -#endif // EXCLUDE_SM_100f +#endif // EXCLUDE_SM_100 } } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.cpp index bbec738dd2..9ff85d9d7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.cpp @@ -29,13 +29,12 @@ namespace kernels //////////////////////////////////////////////////////////////////////////////////////////////////// TllmGenFmhaRunner::TllmGenFmhaRunner(Data_type dtypeQ, Data_type dtypeKv, Data_type dtypeOut) - : mSM(kSM_100) + : mSM(tensorrt_llm::common::getSMVersion()) , mDtypeQ(dtypeQ) , mDtypeKv(dtypeKv) , mDtypeOut(dtypeOut) { - auto smVer = tensorrt_llm::common::getSMVersion(); - TLLM_CHECK_WITH_INFO(smVer == kSM_100 || smVer == kSM_103, "Unsupported architecture"); + TLLM_CHECK_WITH_INFO(mSM == kSM_100, "Unsupported architecture"); TLLM_CHECK_WITH_INFO( mDtypeQ == DATA_TYPE_E4M3 || mDtypeQ == DATA_TYPE_FP16 || mDtypeQ == DATA_TYPE_BF16, "Unsupported Q data type"); TLLM_CHECK_WITH_INFO(mDtypeKv == DATA_TYPE_E4M3 || mDtypeKv == DATA_TYPE_FP16 || mDtypeKv == DATA_TYPE_BF16, diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h index 50688ccba7..c9c35c228e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h @@ -198,6 +198,8 @@ struct TllmGenFmhaRunnerParams void const* kSfBasePtr; // The scaling factor pointer of V. void const* vSfBasePtr; + // The attention sinks pointer (additional value per head in the denominator of the softmax). + float const* attentionSinksPtr; // The custom mask ptr. uint32_t const* customMaskPtr; // The packed custom mask's offsets of each sequence. diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h index 7094f7c382..632ecdb97b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h @@ -64,6 +64,8 @@ struct KernelParams // The output SF pointer (used for FP4 output). void* ptrSfO; + // The attention sinks pointer (additional value per head in the denominator of the softmax). + float const* ptrAttentionSinks; // The cumulative sequence lengths for Q. int32_t const* ptrCumSeqLensQ; // The cumulative sequence lengths for K/V. @@ -138,9 +140,6 @@ struct KernelParams int32_t mNumHeadsQPerKv; // The hidden size of O. int64_t mNumHiddenEltsO; - // The number of MTP tokens per sequence. Assume that all requests have the same numMtpTokens - // without paddings. - int32_t mNumMtpTokens; // The total number of pages in the paged-kv memory pool. int32_t mNumPagesInMemPool; // The output scale for FP8 quantization. @@ -717,6 +716,7 @@ struct KernelParams options, kernelMeta.mDataTypeQ, shapeO, strideO, tileShapeO, const_cast(options.oPtr)); // Set the other kernel parameters. + params.ptrAttentionSinks = options.attentionSinksPtr; params.ptrCumSeqLensQ = options.cumSeqLensQPtr; params.ptrCumSeqLensKv = options.cumSeqLensKvPtr; @@ -772,8 +772,6 @@ struct KernelParams params.mMaxNumCtasQ = maxNumCtasQ; params.mMaxNumCtasKv = maxNumCtasKv; params.mMaxNumPagesPerSeqKv = options.mMaxNumPagesPerSeqKv; - // TODO: just use mMaxSeqLenQ for number of MTP tokens. - params.mNumMtpTokens = options.mMaxSeqLenQ; params.mSumOfSeqLensQ = options.mSumOfSeqLensQ; params.mSumOfSeqLensKv = options.mSumOfSeqLensKv; params.mBatchSize = options.mBatchSize; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp index 744294f34b..761fb475de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp @@ -113,8 +113,8 @@ void TrtllmGenGemmRunner::run(int32_t m, int32_t n, int32_t k, void const* a, fl // FIXME once we start using all-reduce in the epilogue of the gemm this can be moved elsewhere gemm.runInitBeforeWorldSync(config, gemmData, static_cast(stream)); - auto const err = gemm.run(config, workspace, gemmData, static_cast(stream), multiProcessorCount, - /*usePdl=*/true, globalTrtllmGenGemmModuleCache); + auto const err = gemm.run( + config, workspace, gemmData, static_cast(stream), multiProcessorCount, globalTrtllmGenGemmModuleCache); TLLM_CHECK_WITH_INFO(err == 0, "Error occurred when running GEMM!"); } diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/Enums.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/Enums.h index adae51a36d..0ff3334a3e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/Enums.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/Enums.h @@ -39,31 +39,6 @@ enum class AllReduceAlgo : uint32_t //////////////////////////////////////////////////////////////////////////////////////////////////// -enum class MatrixLayout -{ - // K-major layout (default). [Mn, K] - MajorK = 0, - // M-major for A and N-major for B. [K, Mn] - MajorMn, - // Layout is blocked along the K dimension as seen in the diagram below. [K / blockK, Mn, blockK] - // where blockK is fixed at 128B - // - // ├────────────── K ──────────────┤ - // ┬ ┬ ├──── K block ───┤ - // │ │ │ 0 1 2 3 ║ 32 33 34 35 │ - // │ CTA0 │ 4 5 6 7 ║ 36 37 38 39 │ - // │ │ │ 8 9 10 11 ║ 40 41 42 43 │ - // │ ┴ │ 12 13 14 15 ║ 44 45 46 47 │ - // M ┬ ├────────────────║────────────────┤ - // │ │ │ 16 17 18 19 ║ 48 49 50 51 │ - // │ CTA1 │ 20 21 22 23 ║ 52 53 54 55 │ - // │ │ │ 24 25 26 27 ║ 56 57 58 59 │ - // ┴ ┴ │ 28 29 30 31 ║ 60 61 62 63 │ - BlockMajorK -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - enum class SplitK : uint32_t { // No split-k is needed. I.e. mNumSlicesForSplitK == 1. @@ -79,20 +54,6 @@ enum class SplitK : uint32_t //////////////////////////////////////////////////////////////////////////////////////////////////// -enum class BiasType : uint32_t -{ - // No bias. - None = 0, - // One bias value per N of the output tensor. - M = 1, - // One bias value per row M of the output tensor. - N = 2, - // One bias value for each element of the output tensor. - Mn = 3, -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - enum class TileScheduler { // Static scheduler (Non-persistent). @@ -119,23 +80,6 @@ SPLIT_K_FUNCTION(Dsmem) //////////////////////////////////////////////////////////////////////////////////////////////////// -// Helper functions to check the Bias type. - -#define BIAS_TYPE_FUNCTION(Mode) \ - inline bool isBiasType##Mode(BiasType type) \ - { \ - return (type == BiasType::Mode); \ - } - -BIAS_TYPE_FUNCTION(None) -BIAS_TYPE_FUNCTION(N) -BIAS_TYPE_FUNCTION(M) -BIAS_TYPE_FUNCTION(Mn) - -#undef BIAS_TYPE_FUNCTION - -//////////////////////////////////////////////////////////////////////////////////////////////////// - } // namespace gemm } // namespace gemm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h index 0e7b7c13cc..459d831e0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h @@ -63,10 +63,8 @@ struct GemmData { // The matrix A. The data type is controlled by options.mDtypeA. // - // When layoutA is MatrixLayout::MajorK, the shape is [M, K]. - // When LayoutA is MatrixLayout::MajorMn, the shape is [K, M]. - // When LayoutA is MatrixLayout::BlockMajorK, the shape is [K / blockK, M, blockK] where blockK - // is 128B. + // When transposeMatrixA is false, the shape is [M, K]. + // Otherwise, the shape is [K, M]. // The rightmost dimension is contiguous in memory. void const* mPtrA{nullptr}; @@ -102,10 +100,8 @@ struct GemmData // The matrix B. The data type is controlled by options.mDtypeB. // - // When layoutB is MatrixLayout::MajorK, the shape is [N, K]. - // When layoutB is MatrixLayout::MajorMn, the shape is [K, N]. - // When layoutB is MatrixLayout::BlockMajorK, the shape is [K / blockK, N, blockK] where blockK - // is 128B. + // When transposeMatrixB is true, the shape is [N, K]. + // Otherwise, the shape is [K, N]. // The rightmost dimension is contiguous in memory. void const* mPtrB{nullptr}; @@ -146,21 +142,6 @@ struct GemmData // The shape is [N] void const* mPtrPerTokenSfB{nullptr}; - // The bias applied after the GEMM. - // The bias is applied before applying the global scaling factor. I.e. - // C' = (A * B + bias') * scaleC - // scaleC = dequantA * dequantB * quantC - // Thus, the bias' = bias / (dequantA * dequantB), where the bias is the original bias. - // - // if BiasType is N, the shape is [N]. - // The bias is broadcasted along the M dimension. - // - // if BiasType is M, the shape is [M]. - // The bias is broadcasted along the N dimension. - // - // The dtype is float32. - void const* mPtrBias{nullptr}; - // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. // TensorRT-LLM API requires a scaling factor on the device. // Shape is [1]. @@ -249,7 +230,7 @@ public: // Launch the cubin from the provided config. It calls all necessary memsets for internal buffers. // Provided config must be validated with isValidConfig before the call. int32_t run(GemmConfig const& config, void* workspace, GemmData const& options, void* cudaStream, - int32_t multiProcessorCount, bool usePdl = true, + int32_t multiProcessorCount, std::optional> moduleCache = std::nullopt) const; // Initializes the buffers before the world sync. Must be called before run. @@ -407,11 +388,8 @@ bool GemmInterface::isValidConfig(GemmConfig const& config, GemmData const& data //////////////////////////////////////////////////////////////////////////////////////////////////// int32_t GemmInterface::run(GemmConfig const& config, void* workspace, GemmData const& data, void* cudaStream, - int32_t multiProcessorCount, bool usePdl, std::optional> moduleCache) const + int32_t multiProcessorCount, std::optional> moduleCache) const { - // Might be used. - (void) usePdl; - (void) moduleCache; // Get options from config and data. auto options = getOptionsFromConfigAndData(config, data); @@ -439,14 +417,15 @@ int32_t GemmInterface::run(GemmConfig const& config, void* workspace, GemmData c int numTilesN = gemm::divUp(options.mN, options.mTileN); // Create kernel params. - auto kernelParams = gemm::KernelParamsSetup::setKernelParams(options, data.mInputBuffers.mPtrA, + auto kernelParams = gemm::KernelParams::setKernelParams(options, data.mInputBuffers.mPtrA, data.mInputBuffers.mPtrSfA, data.mInputBuffers.mPtrPerTokenSfA, data.mInputBuffers.mPtrB, - data.mInputBuffers.mPtrSfB, data.mInputBuffers.mPtrPerTokenSfB, data.mInputBuffers.mPtrBias, - data.mOutputBuffers.mPtrC, data.mOutputBuffers.mPtrSfC, data.mOutputBuffers.mPtrMultiMemC, - (float*) data.mInputBuffers.mPtrScaleC, dSplitKSlices, data.mAllReduceBuffers.mPtrTileBars, - data.mAllReduceBuffers.mPtrMultiMemTileBars, data.mAllReduceBuffers.mPtrCompletionBars, - data.mAllReduceBuffers.mPtrMultiMemCompletionBars, dPtrSplitKCompletionBars, + data.mInputBuffers.mPtrSfB, data.mInputBuffers.mPtrPerTokenSfB, data.mOutputBuffers.mPtrC, + data.mOutputBuffers.mPtrSfC, data.mOutputBuffers.mPtrMultiMemC, (float*) data.mInputBuffers.mPtrScaleC, + dSplitKSlices, data.mAllReduceBuffers.mPtrTileBars, data.mAllReduceBuffers.mPtrMultiMemTileBars, + data.mAllReduceBuffers.mPtrCompletionBars, data.mAllReduceBuffers.mPtrMultiMemCompletionBars, + dPtrSplitKCompletionBars, /* dPtrNumNonExitingCtas */ nullptr, data.mProblemDimensions.mRank, data.mProblemDimensions.mWorldSize); + // The size of the grid. std::vector grid{numTilesM, numTilesN, options.mNumSlicesForSplitK}; @@ -464,26 +443,26 @@ int32_t GemmInterface::run(GemmConfig const& config, void* workspace, GemmData c #ifdef TLLM_GEN_EXPORT_INTERFACE CUmodule cuModule; CUfunction cuFunction; - if (moduleCache.has_value()) { ModuleCache& moduleCacheRef = moduleCache.value().get(); - // Modules are associated with a specific context, so the context is included in the key + // Modules are associated with a specific context so include the ctxId in the key CUcontext ctx; unsigned long long ctxId; cuCtxGetCurrent(&ctx); cuCtxGetId(ctx, &ctxId); - // Reinterpret the ctxId as a string to avoid needing a custom hash or converting it to a - // string in decimal representation. + // Reinterpret the ctxId as a string to avoid needing a custom hash or converting it to a string in decimal + // representation. std::string const ctxName = std::string(reinterpret_cast(&ctxId), sizeof(unsigned long long) / sizeof(char)); std::string const funcName = std::string(config.mFunctionName); + // As the ctxName is a fixed number of bytes, the two strings can just be appended without risk of a collision auto const moduleKey = ctxName + funcName; auto module = moduleCacheRef.find(moduleKey); - // Use cache if module is found, otherwise load and insert into cache + // Check if module exists in cache. Otherwise, load it if (module != moduleCacheRef.end()) { cuFunction = std::get<1>(module->second); @@ -513,18 +492,17 @@ int32_t GemmInterface::run(GemmConfig const& config, void* workspace, GemmData c // Run the kernel. auto result = trtllm::gen::launchKernel((void*) &kernelParams, cudaStream, config.mSharedMemSize, cuFunction, block3, grid3, cluster3, - usePdl - && (config.mOptions.mGridWaitForPrimaryEarlyExit | config.mOptions.mGridWaitForPrimaryA - | config.mOptions.mGridWaitForPrimaryB)); + config.mOptions.mGridWaitForPrimaryEarlyExit | config.mOptions.mGridWaitForPrimaryA + | config.mOptions.mGridWaitForPrimaryB); + if (result != CUDA_SUCCESS) + { + return -1; + } // If a module cache has not been given, unload the module to avoid leaking if (!moduleCache.has_value()) { cuModuleUnload(cuModule); } - if (result != CUDA_SUCCESS) - { - return -1; - } #else config.mCudaRunner->run((void*) &kernelParams, (void*) cudaStream, grid); #endif diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmOptions.h index 1e3abbfeef..8ab241fc6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmOptions.h @@ -91,23 +91,20 @@ struct GemmOptions GemmOptions() = default; - GemmOptions(AllReduceAlgo allReduceAlgo, BiasType biasType, int blockK, int clusterDimX, int clusterDimY, - int clusterDimZ, tg::Dtype dtypeAcc, tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeMmaA, - tg::Dtype dtypeMmaB, bool enablesEarlyExit, bool enablesDelayedEarlyExit, bool enablesGlobalPtxKnobs, - int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, bool gridTriggerSecondaryA, - bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, - bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, KernelTraits kernelTraits, - MatrixLayout layoutA, MatrixLayout layoutB, int m, int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, - bool mockAllReduce, int n, int numSlicesForSplitK, int numSlicesForSliceK, int numStages, int numStagesMma, - int numStagesMmaWithinWorkTile, int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, - bool patchF2fp, bool useShuffledMatrixA, bool sliceK, SplitK splitK, bool transposeMmaOutput, int tileM, + GemmOptions(AllReduceAlgo allReduceAlgo, int clusterDimX, int clusterDimY, int clusterDimZ, tg::Dtype dtypeAcc, + tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, bool enablesEarlyExit, bool enablesDelayedEarlyExit, + bool enablesGlobalPtxKnobs, int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, + bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, + bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, + KernelTraits kernelTraits, int m, int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n, + int numSlicesForSplitK, int numSlicesForSliceK, int numStages, int numStagesMma, int numStagesMmaWithinWorkTile, + int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, bool useShuffledMatrixA, + bool sliceK, SplitK splitK, bool transposeMatrixA, bool transposeMatrixB, bool transposeMmaOutput, int tileM, int tileN, int tileK, bool useUnrollLoop2xForMma, bool useCustomMmaSchedule, bool useHoistTryWaitForCustomMmaSchedule, bool useDeepSeekFp8, bool usePerTokenSfA, bool usePerTokenSfB, bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, - tg::SfLayout sfLayoutC, int sfReshapeFactor, TileScheduler tileScheduler) + tg::SfLayout sfLayoutC, TileScheduler tileScheduler) : mAllReduceAlgo{allReduceAlgo} - , mBiasType{biasType} - , mBlockK(blockK) , mClusterDimX{clusterDimX} , mClusterDimY{clusterDimY} , mClusterDimZ{clusterDimZ} @@ -115,8 +112,6 @@ struct GemmOptions , mDtypeA{dtypeA} , mDtypeB{dtypeB} , mDtypeC{dtypeC} - , mDtypeMmaA{dtypeMmaA} - , mDtypeMmaB{dtypeMmaB} , mEnablesEarlyExit{enablesEarlyExit} , mEnablesDelayedEarlyExit{enablesDelayedEarlyExit} , mEnablesGlobalPtxKnobs{enablesGlobalPtxKnobs} @@ -133,8 +128,6 @@ struct GemmOptions , mHoistMmaTaskTryWaits{hoistMmaTaskTryWaits} , mK{k} , mKernelTraits{kernelTraits} - , mLayoutA{layoutA} - , mLayoutB{layoutB} , mM{m} , mMmaK{mmaK} , mMmaKind{mmaKind} @@ -150,10 +143,11 @@ struct GemmOptions , mNumStagesMmaAcrossWorkTile{numStagesMmaAcrossWorkTile} , mNumStagesWorkId{numStagesWorkId} , mOutputDebugTensors{outputDebugTensors} - , mPatchF2fp{patchF2fp} , mUseShuffledMatrixA{useShuffledMatrixA} , mSliceK{sliceK} , mSplitK{splitK} + , mTransposeMatrixA{transposeMatrixA} + , mTransposeMatrixB{transposeMatrixB} , mTransposeMmaOutput{transposeMmaOutput} , mTileM{tileM} , mTileN{tileN} @@ -170,17 +164,13 @@ struct GemmOptions , mSfLayoutA{sfLayoutA} , mSfLayoutB{sfLayoutB} , mSfLayoutC{sfLayoutC} - , mSfReshapeFactor{sfReshapeFactor} , mTileScheduler{tileScheduler} { } // The all-reduce algorithm. AllReduceAlgo mAllReduceAlgo{AllReduceAlgo::None}; - // The type of bias. - BiasType mBiasType{BiasType::None}; - // Block size in the K dimension - int mBlockK{-1}; + // Cluster size in X dim. int mClusterDimX{1}; // Cluster size in Y dim. @@ -195,10 +185,6 @@ struct GemmOptions tg::Dtype mDtypeB{tg::Dtype::Void}; // Data type of the outputs. tg::Dtype mDtypeC{tg::Dtype::Void}; - // Data type of the A matrix for the MMA, if different from the input type. - tg::Dtype mDtypeMmaA{tg::Dtype::Void}; - // Data type of the B matrix for the MMA, if different from the input type. - tg::Dtype mDtypeMmaB{tg::Dtype::Void}; // Whether to enable early exit. bool mEnablesEarlyExit{false}; // Whether to enable delayed early exit to overlap @@ -239,10 +225,6 @@ struct GemmOptions int mK{16 * 16}; // Traits of the kernel. KernelTraits mKernelTraits{}; - // Layout of A matrix - MatrixLayout mLayoutA{MatrixLayout::MajorK}; - // Layout of B matrix - MatrixLayout mLayoutB{MatrixLayout::MajorK}; // The M dimension of GEMM. int mM{128 * 2}; // Size of the MMA instruction in the K dimension. @@ -277,14 +259,16 @@ struct GemmOptions int mNumStagesWorkId{3}; // Whether to output debug tensors. bool mOutputDebugTensors{false}; - // Patch float conversions. - bool mPatchF2fp{false}; // Reorder rows/cols in the A matrix for the better memory accesses in the M-major epilogue. bool mUseShuffledMatrixA{false}; // Slice-K implementation to use TileM dimension for TileK. bool mSliceK{false}; // The location of the exchange for split-K (it's None when split-K is disabled). SplitK mSplitK{SplitK::None}; + // Is A matrix in a transposed layout? M major if true, K major otherwise + bool mTransposeMatrixA{false}; + // Is B matrix in a transposed layout? K major if true, N major otherwise + bool mTransposeMatrixB{true}; // Save output of MMA in M-major format. bool mTransposeMmaOutput{false}; // M tile dimension of GEMM. @@ -319,12 +303,6 @@ struct GemmOptions tg::SfLayout mSfLayoutB{tg::SfLayout::R128c4}; // Scale factors layout for C. tg::SfLayout mSfLayoutC{tg::SfLayout::R128c4}; - // Number of "repeats", i.e. reshaping factor, to fold hidden dimension into SfBlock dimension. - // As result, the hidden dimension of the SF tensor must be a multiple of NumRepeats * - // numEltsPerSf * 4. This reduces the problem shape space that the kernel is able to run. - // But it reduces the number of L2 requests under the hood and potentially improves perf. - // Applies to layout 8x4 only. - int mSfReshapeFactor{1}; // Tile scheduler type. TileScheduler mTileScheduler{TileScheduler::Static}; }; @@ -354,7 +332,6 @@ struct GemmConfig uint32_t const mSharedMemSize{0}; char const* mFunctionName{nullptr}; uint32_t const mNumThreadsPerCTA{0}; - char const* mHash{nullptr}; #else trtllm::gen::CudaRunner* mCudaRunner{nullptr}; #endif @@ -396,10 +373,6 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mAllReduceAlgo=" << "gemm::AllReduceAlgo(" << static_cast(options.mAllReduceAlgo) << ")" << "," << std::endl; - ss << "mBiasType=" - << "gemm::BiasType(" << static_cast(options.mBiasType) << ")" - << "," << std::endl; - ss << "mBlockK=" << options.mBlockK << "," << std::endl; ss << "mClusterDimX=" << options.mClusterDimX << "," << std::endl; ss << "mClusterDimY=" << options.mClusterDimY << "," << std::endl; ss << "mClusterDimZ=" << options.mClusterDimZ << "," << std::endl; @@ -415,12 +388,6 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mDtypeC=" << "trtllm::gen::Dtype(" << static_cast(options.mDtypeC) << ")" << "," << std::endl; - ss << "mDtypeMmaA=" - << "trtllm::gen::Dtype(" << static_cast(options.mDtypeMmaA) << ")" - << "," << std::endl; - ss << "mDtypeMmaB=" - << "trtllm::gen::Dtype(" << static_cast(options.mDtypeMmaB) << ")" - << "," << std::endl; ss << "mEnablesEarlyExit=" << options.mEnablesEarlyExit << "," << std::endl; ss << "mEnablesDelayedEarlyExit=" << options.mEnablesDelayedEarlyExit << "," << std::endl; ss << "mEnablesGlobalPtxKnobs=" << options.mEnablesGlobalPtxKnobs << "," << std::endl; @@ -438,10 +405,6 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mK=" << options.mK << "," << std::endl; ss << "mKernelTraits={}" << "," << std::endl; - ss << "mLayoutA=gemm::MatrixLayout(" << static_cast(options.mLayoutA) << ")" - << "," << std::endl; - ss << "mLayoutB=gemm::MatrixLayout(" << static_cast(options.mLayoutB) << ")" - << "," << std::endl; ss << "mM=" << options.mM << "," << std::endl; ss << "mMmaK=" << options.mMmaK << "," << std::endl; ss << "mMmaKind=" @@ -459,12 +422,13 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mNumStagesMmaAcrossWorkTile=" << options.mNumStagesMmaAcrossWorkTile << "," << std::endl; ss << "mNumStagesWorkId=" << options.mNumStagesWorkId << "," << std::endl; ss << "mOutputDebugTensors=" << options.mOutputDebugTensors << "," << std::endl; - ss << "mPatchF2fp=" << options.mPatchF2fp << "," << std::endl; ss << "mUseShuffledMatrixA=" << options.mUseShuffledMatrixA << "," << std::endl; ss << "mSliceK=" << options.mSliceK << "," << std::endl; ss << "mSplitK=" << "gemm::SplitK(" << static_cast(options.mSplitK) << ")" << "," << std::endl; + ss << "mTransposeMatrixA=" << options.mTransposeMatrixA << "," << std::endl; + ss << "mTransposeMatrixB=" << options.mTransposeMatrixB << "," << std::endl; ss << "mTransposeMmaOutput=" << options.mTransposeMmaOutput << "," << std::endl; ss << "mTileM=" << options.mTileM << "," << std::endl; ss << "mTileN=" << options.mTileN << "," << std::endl; @@ -487,7 +451,6 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mSfLayoutC=" << "trtllm::gen::SfLayout(" << static_cast(options.mSfLayoutC) << ")" << "," << std::endl; - ss << "mSfReshapeFactor=" << options.mSfReshapeFactor << "," << std::endl; ss << "mTileScheduler=" << "gemm::TileScheduler(" << static_cast(options.mTileScheduler) << ")" << std::endl; return ss.str(); @@ -527,7 +490,6 @@ inline int32_t getShuffleBlockSize(int epilogueTileM) inline bool checkAndUpdateGemmOptions( GemmOptions& options, bool isBlackwell, int /* tpGrpSize */, bool updateOptions = true) { - if (options.mDtypeB == tg::Dtype::Void) { if (updateOptions) @@ -540,98 +502,39 @@ inline bool checkAndUpdateGemmOptions( } } - // If not specified, used the input dtypes as MMA dtypes (no cast required). - if (options.mDtypeMmaA == tg::Dtype::Void) - { - if (updateOptions) - { - options.mDtypeMmaA = options.mDtypeA; - } - else - { - return false; - } - } - if (options.mDtypeMmaB == tg::Dtype::Void) - { - if (updateOptions) - { - options.mDtypeMmaB = options.mDtypeB; - } - else - { - return false; - } - } - - // Check that the A cast is supported. - // Currently, we only support {MxFp4, NvFp4} -> Bf16. - TLLM_CHECK_ERROR((options.mDtypeA == options.mDtypeMmaA) - || ((options.mDtypeA == tg::Dtype::MxE2m1 || options.mDtypeA == tg::Dtype::E2m1) - && options.mDtypeMmaA == tg::Dtype::Bfloat16) - || (options.mDtypeA == tg::Dtype::E2m1 && options.mDtypeMmaA == tg::Dtype::E4m3), - "Unsupported cast for A: ", tg::dtypeToString(options.mDtypeA), " -> ", tg::dtypeToString(options.mDtypeMmaA)); - - // Check that the B cast is supported. - // Currently, we only support Fp8 -> MxFp8. - // TODO: add same support for A (no transpose) - TLLM_CHECK_ERROR((options.mDtypeB == options.mDtypeMmaB) - || (options.mDtypeB == tg::Dtype::E4m3 && options.mDtypeMmaB == tg::Dtype::MxE4m3), - "Unsupported cast for B: ", tg::dtypeToString(options.mDtypeB), " -> ", tg::dtypeToString(options.mDtypeMmaB)); - - if (options.mDtypeA != options.mDtypeMmaA) - { - TLLM_CHECK_ERROR(options.mTileM == 128, "TileM must be 128 when casting the input matrix A before the MMA."); - } - - if (options.mPatchF2fp) - { - TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::MxE2m1 && options.mDtypeMmaA == tg::Dtype::Bfloat16, - "PatchF2fp is only supported for MxFp4 to Bf16 casts."); - } - // FIXME: We do not support different dtypes for A and B when not on Blackwell. if (!isBlackwell) { - TLLM_CHECK_ERROR( - options.mDtypeMmaA == options.mDtypeMmaB, "For non-Blackwell, A and B must have the same dtype."); + TLLM_CHECK_ERROR(options.mDtypeA == options.mDtypeB, "For non-Blackwell, A and B must have the same dtype."); } // Check that the different dtypes for A and B are supported by the tensor core // kind::f8f6f4 - if (options.mDtypeMmaA == tg::Dtype::E4m3 || options.mDtypeMmaA == tg::Dtype::E2m1) + if (options.mDtypeA == tg::Dtype::E4m3 || options.mDtypeA == tg::Dtype::E2m1) { - TLLM_CHECK_ERROR(options.mDtypeMmaB == tg::Dtype::E4m3 || options.mDtypeMmaB == tg::Dtype::E2m1, - "For dtypeMmaA = E4m3/E2m1 A, dtypeMmaB must also be E4m3/E2m1."); + TLLM_CHECK_ERROR(options.mDtypeB == tg::Dtype::E4m3 || options.mDtypeB == tg::Dtype::E2m1, + "For E4m3/E2m1 A, B must also be E4m3/E2m1."); } // kind::mxf8f6f4 - if (options.mDtypeMmaA == tg::Dtype::MxE4m3 || options.mDtypeMmaA == tg::Dtype::MxE2m1) + if (options.mDtypeA == tg::Dtype::MxE4m3 || options.mDtypeA == tg::Dtype::MxE2m1) { - TLLM_CHECK_ERROR(options.mDtypeMmaB == tg::Dtype::MxE4m3 || options.mDtypeMmaB == tg::Dtype::MxE2m1, - "For dtypeMmaA = MxE4m3 or MxE2m1, dtypeMmaB must also be MxE4m3 or MxE2m1."); + TLLM_CHECK_ERROR(options.mDtypeB == tg::Dtype::MxE4m3 || options.mDtypeB == tg::Dtype::MxE2m1, + "For dtypeA = MxE4m3 or MxE2m1, dtypeB must also be MxE4m3 or MxE2m1."); } - if (options.mDtypeMmaB == tg::Dtype::MxE4m3 || options.mDtypeMmaB == tg::Dtype::MxE2m1) + if (options.mDtypeB == tg::Dtype::MxE4m3 || options.mDtypeB == tg::Dtype::MxE2m1) { - TLLM_CHECK_ERROR(options.mDtypeMmaA == tg::Dtype::MxE4m3 || options.mDtypeMmaA == tg::Dtype::MxE2m1, - "For dtypeMmaB = MxE4m3 or MxE2m1, dtypeMmaA must also be MxE4m3 or MxE2m1."); + TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::MxE4m3 || options.mDtypeA == tg::Dtype::MxE2m1, + "For dtypeB = MxE4m3 or MxE2m1, dtypeA must also be MxE4m3 or MxE2m1."); } // kind::f16 - if (options.mDtypeMmaA == tg::Dtype::Fp16 || options.mDtypeMmaA == tg::Dtype::Bfloat16) + if (options.mDtypeA == tg::Dtype::Fp16 || options.mDtypeA == tg::Dtype::Bfloat16) { - TLLM_CHECK_ERROR(options.mDtypeMmaB == options.mDtypeMmaA, - "For dtypeMmaA = Fp16/Bfloat16, dtypeMmaB must be the same as dtypeMmaA."); + TLLM_CHECK_ERROR(options.mDtypeB == options.mDtypeA, "For Fp16/Bfloat16 A, B must be the same type as A."); } - // When one of the inputs needs to be cast, we must use two load warps. - if ((options.mDtypeMmaA != options.mDtypeA || options.mDtypeMmaB != options.mDtypeB) - && !options.mUseTwoTmaLoadWarps) - { - TLLM_LOG_WARNING("Two TMA load warps must be enabled if any of the inputs needs to be cast."); - } - - // When different dtypes are used for A and B, we must use different tiles to do the loading. + // When different dtype are used for A and B, we must use different tile to do the loading. // It is not strictly required, but current implementation of SmemAb requires that. if (options.mDtypeA != options.mDtypeB) { @@ -644,7 +547,7 @@ inline bool checkAndUpdateGemmOptions( { if (updateOptions) { - options.mMmaKind = dtypeGetMmaKind(options.mDtypeMmaA, options.mDtypeMmaB); + options.mMmaKind = dtypeGetMmaKind(options.mDtypeA, options.mDtypeB); } else { @@ -652,6 +555,11 @@ inline bool checkAndUpdateGemmOptions( } } + if (options.mMmaKind == tg::MmaKind::Fp16) + { + TLLM_CHECK_ERROR(options.mDtypeA == options.mDtypeB, "For Fp16 MMA, A and B must have the same dtype."); + } + if ((options.mMmaKind == tg::MmaKind::Fp8Fp6Fp4 || options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) && options.mMmaK != 32) { @@ -718,6 +626,9 @@ inline bool checkAndUpdateGemmOptions( { TLLM_CHECK_ERROR(isBlackwell, "Block scaling is only supported on Blackwell"); + TLLM_CHECK_ERROR(options.mSfLayoutB == tg::SfLayout::R128c4 || options.mSfLayoutB == tg::SfLayout::R8c4, + "Only the 128x4 and 8x4 SF layouts are supported for B, got ", tg::sfLayoutToString(options.mSfLayoutB)); + int const mmaK = (options.mMmaKind == tg::MmaKind::MxFp4NvFp4) ? 64 : 32; if (options.mMmaK != mmaK) { @@ -735,56 +646,21 @@ inline bool checkAndUpdateGemmOptions( } } - // The MMA N may only be smaller than 64 if it is equal to the tile N. - TLLM_CHECK_ERROR(options.mMmaN >= 64 || options.mMmaN == options.mTileN, "MmaN (", options.mMmaN, - ") must be >= 64 or equal to TileN (", options.mTileN, ")"); - } - if (tg::dtypeIsBlockFmt(options.mDtypeA)) - { - int numEltsPerSfA = tg::dtypeNumEltsPerSf(options.mDtypeA); - TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfA) == 0, "TileK (", options.mTileK, - ") must be a multiple of ", (4 * numEltsPerSfA), " for typeA ", gemm::toString(options.mDtypeA)); - auto const numEltsPerSfAInK = options.mK / numEltsPerSfA; - TLLM_CHECK_ERROR(numEltsPerSfAInK % 4 == 0, "K dimension of scaling factors for A (", numEltsPerSfAInK, - ") must be a multiple of 4"); - } - if (tg::dtypeIsBlockFmt(options.mDtypeB)) - { - TLLM_CHECK_ERROR(options.mSfLayoutB == tg::SfLayout::R128c4 || options.mSfLayoutB == tg::SfLayout::R8c4 - || options.mSfLayoutB == tg::SfLayout::Linear, - "Only the 128x4 and 8x4 SF layouts are supported for B, got ", tg::sfLayoutToString(options.mSfLayoutB)); - // TileN must be a multiple of the number of rows per SF tile. int const numSfTileRowsB = options.mSfLayoutB == tg::SfLayout::R128c4 ? 128 : 8; TLLM_CHECK_ERROR(options.mTileN % numSfTileRowsB == 0, "TileN (", options.mTileN, ") must be a multiple of ", numSfTileRowsB, " for B SF layout ", tg::sfLayoutToString(options.mSfLayoutB)); + // The MMA N may only be smaller than 64 if it is equal to the tile N. + TLLM_CHECK_ERROR(options.mMmaN >= 64 || options.mMmaN == options.mTileN, "MmaN (", options.mMmaN, + ") must be >= 64 or equal to TileN (", options.mTileN, ")"); + int numEltsPerSfA = tg::dtypeNumEltsPerSf(options.mDtypeA); int numEltsPerSfB = tg::dtypeNumEltsPerSf(options.mDtypeB); + TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfA) == 0, "TileK (", options.mTileK, + ") must be a multiple of ", (4 * numEltsPerSfA), " for typeA ", gemm::toString(options.mDtypeA)); TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfB) == 0, "TileK (", options.mTileK, ") must be a multiple of ", (4 * numEltsPerSfB), " for typeB ", gemm::toString(options.mDtypeB)); - auto const numEltsPerSfBInK = options.mK / numEltsPerSfB; - TLLM_CHECK_ERROR(numEltsPerSfBInK % 4 == 0, "K dimension of scaling factors for B (", numEltsPerSfBInK, - ") must be a multiple of 4"); } - - int32_t padMultiplierA = 1; - int32_t padMultiplierB = 1; - if (options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) - { - if (options.mDtypeA == tg::Dtype::MxE2m1) - { - padMultiplierA = 2; - } - if (options.mDtypeB == tg::Dtype::MxE2m1) - { - padMultiplierB = 2; - } - } - TLLM_CHECK_ERROR((padMultiplierA * tg::dtypeGetNumBits(options.mDtypeA) * options.mK / 8) % 16 == 0, - "K dimension of A must be aligned to 16 bytes."); - TLLM_CHECK_ERROR((padMultiplierB * tg::dtypeGetNumBits(options.mDtypeB) * options.mK / 8) % 16 == 0, - "K dimension of B must be aligned to 16 bytes."); - if (options.mDtypeC == tg::Dtype::E2m1 || options.mDtypeC == tg::Dtype::MxE4m3) { TLLM_CHECK_ERROR(isBlackwell, "Block scaling is only supported on Blackwell"); @@ -792,10 +668,8 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mSfLayoutC == tg::SfLayout::R128c4 || options.mSfLayoutC == tg::SfLayout::R8c4, "Only the 128x4 and 8x4 SF layouts are supported for C."); int const numSfTileRowsC = options.mSfLayoutC == tg::SfLayout::R128c4 ? 128 : 8; - int const tileTokenDim = options.mTransposeMmaOutput ? options.mTileN : options.mTileM; - TLLM_CHECK_ERROR_FMT(tileTokenDim % numSfTileRowsC == 0, - "Tile%s (%d) must be a multiple of %d for C SF layout %s", options.mTransposeMmaOutput ? "N" : "M", - tileTokenDim, numSfTileRowsC, tg::sfLayoutToString(options.mSfLayoutC).c_str()); + TLLM_CHECK_ERROR(options.mTileN % numSfTileRowsC == 0, "TileN (", options.mTileN, ") must be a multiple of ", + numSfTileRowsC, " for C SF layout ", tg::sfLayoutToString(options.mSfLayoutC)); int const hiddenDim = options.mTransposeMmaOutput ? options.mM : options.mN; int const hiddenGranularity = 4 * tg::dtypeNumEltsPerSf(options.mDtypeC); @@ -879,6 +753,7 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mM > 0 && options.mN > 0 && options.mK > 0, "M, N and K must be larger than 0"); TLLM_CHECK_ERROR(options.mNumSlicesForSplitK > 0, "Split K must be larger than 0."); + TLLM_CHECK_ERROR(options.mK % options.mTileK == 0, "K must be a multiple of TileK"); if (options.mUseShuffledMatrixA) { @@ -1036,11 +911,6 @@ inline bool checkAndUpdateGemmOptions( { TLLM_CHECK_ERROR( options.mNumStagesMmaWithinWorkTile == 1, "Non-DeepSeekFp8 requires numStagesMmaWithinWorkTile == 1"); - if (options.mNumStagesMma > 1) - { - TLLM_CHECK_ERROR(options.mTileScheduler == TileScheduler::Persistent, - "Non-DeepSeekFp8 requires persistent scheduler when using numStagesMma >1"); - } } if (options.mUseDeepSeekFp8) { @@ -1053,7 +923,6 @@ inline bool checkAndUpdateGemmOptions( // Check that TileK = 128 for correct scaling of every 128 channels. TLLM_CHECK_ERROR(options.mTileK == 128, "Tile-K must be equal to 128 for DeepSeek Fp8"); - TLLM_CHECK_ERROR(options.mK % options.mTileK == 0, "K must be a multiple of TileK"); // Tile sizes of the output hidden dimension. auto hiddenDimPerOutputTile = options.mTransposeMmaOutput ? options.mTileM : options.mTileN; auto hiddenDimPerEpilogueTile = options.mTransposeMmaOutput ? options.mEpilogueTileM : options.mEpilogueTileN; @@ -1128,22 +997,14 @@ inline bool checkAndUpdateGemmOptions( if (options.mUseUnrollLoop2xForMma) { - // Number of iterations in K dimension after padding. - // Note the perCtaK in each CTA in the splitK group are padded to the same number of iterations. - // E.g., K = 512, TileK = 128, numSlicesForSplitK = 3. Then the padded K is - // - // ceil(512 / (128*3)) * (128*3) = 768 - // - int paddedK = divUpMul(options.mK, options.mTileK * options.mNumSlicesForSplitK); - // Check that the padded K (K rounded to next multiple of tileK) is a multiple of 2*TileK when - // UnrollLoop2x is enabled. This is to avoid deadlock when mma runs even-numbered loop while the - // other warps run odd-numbered loop. - // - bool notSupported = (paddedK / options.mNumSlicesForSplitK) % (options.mTileK * 2) != 0; + bool notSupported = (options.mK / options.mNumSlicesForSplitK) % (options.mTileK * 2) != 0; + // Check that the 2*TileK is a multiple of MmaK when UnrollLoop2x is enabled. + // This is to avoid deadlock when mma runs even-numbered loop while the other warps run + // odd-numbered loop. if (notSupported) { TLLM_LOG_WARNING("Size K / splitK must be a multiple of TileK * 2. Found TileK=", options.mTileK, - " and K=", options.mK, " (paddedK=", paddedK, ") and numSlicesForSplitK=", options.mNumSlicesForSplitK, + " and K=", options.mK, " and numSlicesForSplitK=", options.mNumSlicesForSplitK, ". Disabling unrollLoop2xForMma."); if (updateOptions) { @@ -1198,108 +1059,43 @@ inline bool checkAndUpdateGemmOptions( // // Kernel 1: ----PREEXIT-----------FLUSH // Kernel 2: -------PREEXIT----ACQBULK---FLUSH - // Kernel 3: Warp 0: ---- (!) Output of 1,2 is not yet visible - // ----------------------- - // Warp 1: ---- (!) We normally assume that 1 is visible is not yet - // visible- Warp 2: -------------------ACQBULK-- Kernel 1,2 output visible - // ---------- + // Kernel 3: Warp 0: ---- (!) Output of 1,2 is not yet visible ----------------------- + // Warp 1: ---- (!) We normally assume that 1 is visible is not yet visible- + // Warp 2: -------------------ACQBULK-- Kernel 1,2 output visible ---------- TLLM_CHECK_ERROR((options.mGridWaitForPrimaryA || !options.mGridTriggerSecondaryA), "A: If a task triggers a secondary kernel, it must also wait for primary kernel."); TLLM_CHECK_ERROR((options.mGridWaitForPrimaryB || !options.mGridTriggerSecondaryB), "B: If a task triggers a secondary kernel, it must also wait for primary kernel."); - if (options.mUsePerTokenSfA || options.mUsePerTokenSfB) - { - // Checks applicable to both MetaFP8 and RoutingScalesOnInput - TLLM_CHECK_ERROR(!options.mUseDeepSeekFp8, "DeepSeek FP8 and per-token scaling are not compatible"); - TLLM_CHECK_ERROR(isBlackwell, "Per-token scaling is not supported for Hopper"); - if (options.mUsePerTokenSfA && options.mUsePerTokenSfB) - { - // MetaFP8 case - TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::E4m3 && options.mDtypeB == tg::Dtype::E4m3, - "A and B dtype must be E4m3 for Meta Fp8. Found dtypeA=", tg::dtypeToString(options.mDtypeA), - " dtypeB=", tg::dtypeToString(options.mDtypeB)); - } - else - { - // RoutingScalesOnInput case - TLLM_CHECK_ERROR((options.mUsePerTokenSfA && !options.mTransposeMmaOutput) - || (options.mUsePerTokenSfB && options.mTransposeMmaOutput), - "In RoutingScalesOnInput mode, perToken scales must be used on activations"); - } - } - // The generation should support non K-major layouts for both A and B; however, it is unclear if // there is a use-case - TLLM_CHECK_ERROR((options.mLayoutA == MatrixLayout::MajorK) || (options.mLayoutB == MatrixLayout::MajorK), - "At least one matrix must be in k-major layout"); + TLLM_CHECK_ERROR(!options.mTransposeMatrixA || options.mTransposeMatrixB, + "TransposeA true and TransposeB false is not supported"); // Some features are currently only support when both matrices are in K-major format - if (options.mLayoutB != MatrixLayout::MajorK || options.mLayoutB != MatrixLayout::MajorK) + if (options.mTransposeMatrixA || !options.mTransposeMatrixB) { TLLM_CHECK_ERROR(isBlackwell, "Non K-major layouts are only supported on Blackwell"); TLLM_CHECK_ERROR(options.mSplitK == SplitK::None, "Non K-major layouts do not support split K"); } - if (options.mLayoutA == MatrixLayout::MajorMn) + if (options.mTransposeMatrixA) { TLLM_CHECK_ERROR(tg::dtypeGetNumBits(options.mDtypeA) >= 8, "Subbyte types only support K major layout"); } - if (options.mLayoutB == MatrixLayout::MajorMn) + if (!options.mTransposeMatrixB) { TLLM_CHECK_ERROR(tg::dtypeGetNumBits(options.mDtypeB) >= 8, "Subbyte types only support K major layout"); } - if ((options.mLayoutA == MatrixLayout::BlockMajorK) || (options.mLayoutB == MatrixLayout::BlockMajorK)) - { - bool const isBlockA = options.mLayoutA == MatrixLayout::BlockMajorK; - - // Block K size must be 128B. - // TODO Leaving this as an option for now in case we want to expertiment with other block sizes - // As the user is not expected to set this, do not fail if updateOptions is false - int32_t const elemSizeInBits - = (isBlockA) ? tg::dtypeGetNumBits(options.mDtypeA) : tg::dtypeGetNumBits(options.mDtypeB); - int32_t const elemsIn128B = 128 * 8 /* Bits in byte */ / elemSizeInBits; - - if (options.mBlockK != elemsIn128B) - { - if (updateOptions) - { - options.mBlockK = elemsIn128B; - } - else - { - return false; - } - } - - if (options.mBlockK > options.mTileK) - { - TLLM_CHECK_ERROR(options.mBlockK % options.mTileK == 0, - "If block size is greater than tile size, block size must be a multiple of tile size"); - } - else if (options.mBlockK < options.mTileK) - { - TLLM_CHECK_ERROR(options.mTileK % options.mBlockK == 0, - "If tile size is greater than block size, tile size must be a multiple of block size"); - } - } - - if (!isBiasTypeNone(options.mBiasType)) - { - TLLM_CHECK_ERROR(!isBiasTypeMn(options.mBiasType), "BiasType::Mn is not supported"); - TLLM_CHECK_ERROR(!options.mUseDeepSeekFp8, "Bias is not supported for DeepSeek Fp8"); - TLLM_CHECK_ERROR(!(options.mUsePerTokenSfA && options.mUsePerTokenSfB), "Bias is not supported for Meta Fp8"); - } - if (updateOptions) { // Init kernel traits. options.mKernelTraits = KernelTraits(options.mDtypeA, options.mDtypeB, options.mDtypeC, options.mDtypeAcc, - options.mDtypeMmaA, options.mDtypeMmaB, options.mMmaKind, options.mTileM, options.mTileN, options.mTileK, - options.mEpilogueTileM, options.mEpilogueTileN, options.mNumStages, options.mNumStagesMma, - options.mNumSlicesForSplitK, options.mNumSlicesForSliceK, options.mSplitK, options.mUseTmaStore, - options.mTransposeMmaOutput, options.mAllReduceAlgo, options.mTileScheduler == TileScheduler::Persistent, - options.mUseDeepSeekFp8, options.mUsePerTokenSfA, options.mUsePerTokenSfB, options.mBiasType); + options.mMmaKind, options.mTileM, options.mTileN, options.mTileK, options.mEpilogueTileM, + options.mEpilogueTileN, options.mNumStages, options.mNumStagesMma, options.mNumSlicesForSplitK, + options.mNumSlicesForSliceK, options.mSplitK, options.mUseTmaStore, options.mTransposeMmaOutput, + options.mAllReduceAlgo, options.mTileScheduler == TileScheduler::Persistent, options.mUseDeepSeekFp8, + options.mUsePerTokenSfA, options.mUsePerTokenSfB); } return true; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h index 7a748fefae..5d55ff418b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h @@ -28,115 +28,113 @@ namespace kernels { // clang-format off -#define TLLM_GEN_COMMIT "32110ebf-dirty" -#define TLLM_GEN_EXPORT_VERSION "7.0" +#define TLLM_GEN_COMMIT "744dc79e" +#define TLLM_GEN_EXPORT_VERSION "6.0" static constexpr size_t tllmGenGemmListLen = 46; #ifndef EXCLUDE_SM_100 -extern unsigned char Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; -extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; -extern unsigned char Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; -extern unsigned char Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; +extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; +extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; +extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; +extern unsigned char GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; #endif // EXCLUDE_SM_100 #ifndef EXCLUDE_SM_100 -extern unsigned int Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; -extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; -extern unsigned int Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; -extern unsigned int Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; +extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; +extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; +extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; +extern unsigned int GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; #endif // EXCLUDE_SM_100 static const gemm::GemmConfig tllmGenGemmList[] = { #ifndef EXCLUDE_SM_100 -{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 150528, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a", 320, "67bad780d8f03b24804e34cc5317720c13949c72009f311e9d17a1cd6b10819a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 +{GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 150528, "gemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a", 320, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -144,8 +142,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -162,8 +158,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -179,10 +173,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 0 , /* mTileM */ 128 , /* mTileN */ 128 @@ -199,12 +194,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 175104, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "f71c31c377d83567ce6db02f270354c85d9cab4e8543726c4f0322121eea617c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 175104, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -212,8 +204,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -230,8 +220,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -247,10 +235,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -267,12 +256,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 175104, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "257908af538388410f3ec0d3e5108e288a5c77d2f9b02383618d5fb08002ee51", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 175104, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -280,8 +266,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -298,8 +282,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -315,10 +297,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -335,12 +318,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 168960, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a", 224, "550d553ade1407a25f2520568149e07b978f150afef43118397e6aa90111bd9b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 168960, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -348,8 +328,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -366,8 +344,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -383,10 +359,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 0 , /* mTileM */ 128 , /* mTileN */ 128 @@ -403,12 +380,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 84992, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "de2f28d823b6eb726debee79d8733d8656821252accd929d3158cace2b6a845a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 84992, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -416,8 +390,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -434,8 +406,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -451,10 +421,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -471,12 +442,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 84992, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "e103cc3353cc9f652e99cbf588ba053ba3b914596dd93c87b25f6e7f1225b5cc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 84992, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -484,8 +452,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -502,8 +468,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -519,10 +483,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -539,12 +504,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 97280, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "a75f88e0c9791b7f53e6c6e764784015b47b2f0a97b22896823a15cc64f69e5a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 97280, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -552,8 +514,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -570,8 +530,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -587,10 +545,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -607,12 +566,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 97280, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "dca5f0c1dcb998e0baa7db6c4a023d5a06e19de4dbb9ce9edb6dd9edc89bb431", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 97280, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -620,8 +576,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -638,8 +592,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -655,10 +607,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -675,12 +628,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 123904, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "ba2d2e2e05b540cf1f56d35c87287a255f1d2043e8970acaf57c928bb5ece183", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 123904, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -688,8 +638,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -706,8 +654,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -723,10 +669,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -743,12 +690,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 123904, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "3128c19b37c22de699fc92e22cbee5d4b05c28dfb85418044ce1eee6f3e9744f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 123904, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -756,8 +700,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -774,8 +716,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -791,10 +731,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -811,12 +752,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "34d0f58dd43e9428e983e7b8bc8c1d703ee9a0cc46f73cb0bf99c40f7de542df", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -824,8 +762,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -842,8 +778,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -859,10 +793,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -879,12 +814,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "007f0ffe31b6104e71385d9f7f378e7974fd942fb6f528eb3ac28b387eed6338", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -892,8 +824,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -910,8 +840,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -927,10 +855,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -947,12 +876,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 217088, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 224, "cd112e5364c7b204daf52b09f6fc23d37f7292fa14f970b050905c53cd71e487", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 217088, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -960,8 +886,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -978,8 +902,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -995,10 +917,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -1015,12 +938,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len, 215040, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a", 224, "cb469548e2f1507579cf58c9cd864472a38203939a81cecd43376c8086839601", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len, 215040, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 2 @@ -1028,8 +948,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1046,8 +964,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1063,10 +979,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -1083,12 +1000,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 225280, "gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 448, "6aad723e3a1f1267f892edbc89a7b95e7058daaae6e65c7a1d8a81968b42df58", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 225280, "gemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1096,8 +1010,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826828) , /* mDtypeB */ trtllm::gen::Dtype(17827853) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1114,8 +1026,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(5) @@ -1131,10 +1041,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -1151,12 +1062,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 158720, "gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "fd16bb4cb68c22a7d6135c131cf9458e054960de0adad06a596aca06a0d4f723", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 158720, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1164,8 +1072,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1182,8 +1088,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1199,10 +1103,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -1219,12 +1124,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 158720, "gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "92f81034b485c2b6724cb83002fdcddb1d102a21b69e265c851263198b16d15b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 158720, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1232,8 +1134,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1250,8 +1150,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1267,10 +1165,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -1287,12 +1186,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 218112, "gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a", 224, "0b5cf51c225dd33ce5348a23c87bce062df590938af2547a8408e6bddf563bb5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 218112, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1300,8 +1196,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1318,8 +1212,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1335,10 +1227,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 0 , /* mTileM */ 128 , /* mTileN */ 128 @@ -1355,12 +1248,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 82944, "gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "35b5bd8342a4ae619b7a441f48e5c96afe7047c4c05e8e8a2fed2023699008bd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 82944, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1368,8 +1258,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1386,8 +1274,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1403,10 +1289,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -1423,12 +1310,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 82944, "gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "995b3f8d3be717372fceccd1bcbf9d811a191685ae50208f29f9779b0f1c20e1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 82944, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1436,8 +1320,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1454,8 +1336,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1471,10 +1351,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -1491,12 +1372,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 93184, "gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "4ae45f02a96abc1fee68d5d482590733206bb21974d7540b244c72599d42b029", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 93184, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1504,8 +1382,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1522,8 +1398,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1539,10 +1413,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -1559,12 +1434,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 93184, "gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "eb325f16f32466cef919bc05f3a08c49ca6ca7d4fb02cb87086a33b3c2893ae6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 93184, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1572,8 +1444,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1590,8 +1460,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1607,10 +1475,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -1627,12 +1496,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 115712, "gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "e0cde71b5344cd24a607f59f690775b198c2dbe8caf1194b82ebfcfe0e7d22a1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 115712, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1640,8 +1506,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1658,8 +1522,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1675,10 +1537,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -1695,12 +1558,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 115712, "gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "f9407c6f8fba4f59f4f8568987b1d29a542fc3076e8cdff5cb80e74d9f4ddcf0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 115712, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1708,8 +1568,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1726,8 +1584,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1743,10 +1599,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -1763,12 +1620,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "2cfc6cace0893b2a2866d06b24146d77b9c2568d371b9e41337f99f020ddf6e9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1776,8 +1630,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1794,8 +1646,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1811,10 +1661,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -1831,12 +1682,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "67414166d1a8a94e40a6f6a2f7d24e99f7634eeaacd015098f212368bd3bc5bc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1844,8 +1692,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1862,8 +1708,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1879,10 +1723,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -1899,12 +1744,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 216064, "gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 224, "99dbd99a0e95a841e9416a1099a40fd4e2b42f1a44fd0352ac331b307cee14f4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 216064, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1912,8 +1754,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1930,8 +1770,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1947,10 +1785,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -1967,12 +1806,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len, 215040, "gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a", 224, "5b1f3d6e3705a32cc257f0566b57dba0ae89c90aae749090ea864e3ac1e152d9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len, 215040, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 2 @@ -1980,8 +1816,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1998,8 +1832,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2015,10 +1847,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2035,12 +1868,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 150528, "gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a", 320, "294b977b50865e7fbe41ef9d006e1912856c96b4e15a7588e24d783e044d0929", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 150528, "gemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a", 320, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2048,8 +1878,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2066,8 +1894,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -2083,10 +1909,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 0 , /* mTileM */ 128 , /* mTileN */ 128 @@ -2103,12 +1930,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 175104, "gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "d05d164e79e213a9b04ce518ced20ec9faad7967ad951cff14a9e60ef47a7047", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 175104, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2116,8 +1940,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2134,8 +1956,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2151,10 +1971,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -2171,12 +1992,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 175104, "gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "d2975334a8914f64992a06417b40d42b1c25e8a57dff639bdb8b0768faea4037", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 175104, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2184,8 +2002,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2202,8 +2018,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2219,10 +2033,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -2239,12 +2054,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 168960, "gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a", 224, "277bd69ed7d198f9fc8ab87c7d9df3f74762dcc48a845711fa881ef1a345c03d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 168960, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2252,8 +2064,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2270,8 +2080,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2287,10 +2095,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 0 , /* mTileM */ 128 , /* mTileN */ 128 @@ -2307,12 +2116,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 84992, "gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "749b06758d88b8c6c9233bef06af59f5c717e767d0ab7636726a6f5808aebec9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 84992, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2320,8 +2126,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2338,8 +2142,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2355,10 +2157,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -2375,12 +2178,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 84992, "gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "4f107da93c4869724202d13aa7d6f69a618ec18367ed1f434efb804e9e2950cb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 84992, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2388,8 +2188,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2406,8 +2204,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2423,10 +2219,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -2443,12 +2240,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 97280, "gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "05b668ebaed4847a3ed92ce43d78442fd006621261f095ddb36ba150ed2d4ad9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 97280, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2456,8 +2250,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2474,8 +2266,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2491,10 +2281,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -2511,12 +2302,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 97280, "gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "c2924dd87f53e3decedf433f205b742ae84cef4a77d7862441591ed0203b91d4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 97280, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2524,8 +2312,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2542,8 +2328,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2559,10 +2343,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -2579,12 +2364,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 123904, "gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "3b6e8407873ded09c5e433d5aa3e4ec0323ba895d13aea285e3a92ce1836046f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 123904, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2592,8 +2374,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2610,8 +2390,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2627,10 +2405,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -2647,12 +2426,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 123904, "gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "6a647bb4f09f60d57a88d8b744bf772a261b3ccc3a5706b4ef0396438ef19cba", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 123904, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2660,8 +2436,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2678,8 +2452,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2695,10 +2467,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -2715,12 +2488,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "98c0347b249a8f8c1e3a891f1532d0f3851ce781028f9d92c0dd18bf9705fd81", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2728,8 +2498,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2746,8 +2514,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2763,10 +2529,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2783,12 +2550,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "103144508c3b2e8d01fcb40ec5980f67b2bdd5da4687635039f007e5ac798546", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2796,8 +2560,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2814,8 +2576,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2831,10 +2591,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2851,12 +2612,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 217088, "gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 224, "47a6adb00497e6864f5f3dd1eb9326de21daddb02576d665951f268568598a9d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 217088, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2864,8 +2622,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2882,8 +2638,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2899,10 +2653,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2919,12 +2674,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len, 215040, "gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a", 224, "0e025e6540a9eaf9e9d84635fcfcc4d63d563288d1210667a446c150eaf44620", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len, 215040, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 2 @@ -2932,8 +2684,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2950,8 +2700,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2967,10 +2715,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2987,12 +2736,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 225280, "gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 448, "acd6bcbe32966a092b0241457c15142a729a33e40ab4a3d5f9e5ada9d0ca80b1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 225280, "gemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -3000,8 +2746,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826828) , /* mDtypeB */ trtllm::gen::Dtype(17827853) , /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3018,8 +2762,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(5) @@ -3035,10 +2777,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -3055,12 +2798,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 183296, "gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a", 320, "b9afcb7beb9cbdf56629ab3e7396c803e13f1a1410e569b60332a123a2aeea2a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 183296, "gemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a", 320, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -3068,8 +2808,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1056776) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3086,8 +2824,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -3103,10 +2839,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 0 , /* mTileM */ 128 , /* mTileN */ 128 @@ -3123,12 +2860,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 227328, "gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 448, "0b7d9e47ffaf50c12ff6888aba42111a2d731fb6aada6a63f33ea578300a2add", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 227328, "gemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -3136,8 +2870,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826828) , /* mDtypeB */ trtllm::gen::Dtype(17827853) , /* mDtypeC */ trtllm::gen::Dtype(1056776) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3154,8 +2886,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(5) @@ -3171,10 +2901,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -3191,12 +2922,9 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, -{Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 224256, "gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 448, "84964cb97f1ba9d334e25c76018e5ab73cc2f1fcbee4391d7e76f61c52a64b9c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 224256, "gemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -3204,8 +2932,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826828) , /* mDtypeB */ trtllm::gen::Dtype(17827853) , /* mDtypeC */ trtllm::gen::Dtype(17827853) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3222,8 +2948,6 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(5) @@ -3239,10 +2963,11 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMatrixA */ 0 +, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -3259,12 +2984,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a}, + }, gemm::SmVersion::Sm100a }, #endif // EXCLUDE_SM_100 }; // clang-format on + } // namespace kernels } // namespace tensorrt_llm } // namespace gemm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h index 17199d0f17..142e9728dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h @@ -22,10 +22,6 @@ #include "Enums.h" #include "TmaDescriptor.h" -// NOTE: keep this code dependency free. It has to be included by the device code and has to be -// compilable with NVRTC. -#include "KernelParamsDecl.h" - namespace gemm { @@ -33,305 +29,535 @@ namespace gemm { //////////////////////////////////////////////////////////////////////////////////////////////////// + namespace tg = trtllm::gen; -namespace KernelParamsSetup +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct KernelParams { #ifdef TLLM_ENABLE_CUDA + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Gemm parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// -using MatrixType = KernelParams::MatrixType; + // TMA descriptor for A. + // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from + // makeTmaShapeStrideAb. + // + // If transposeMatrixA is false + // Logical shape is [M, K]. + // Logical strides are [K, 1]. + // Tile box shape is [tileM, tileK]. + // Tile box strides are [tileK, 1]. + // Dtype is set from options.mDtypeA. + // + // If transposeMatrixA is true + // Logical shape is [K, M]. + // Logical strides are [M, 1]. + // Tile box shape is [tileK, tileM]. + // Tile box strides are [tileM, 1]. + // Dtype is set from options.mDtypeA. + CUtensorMap tmaA; -// Create the TMA shape/stride for A/B. -template -static auto makeTmaShapeStrideAb(GemmOptions const& options, MatrixType matrixType) -{ - // The outer dimension. - auto numTokens = (matrixType == MatrixType::MatrixA) ? options.mM : options.mN; - // The outer dimension tile size. - auto tileMn = (matrixType == MatrixType::MatrixA) ? options.mTileM : options.mTileN; - // The inner dimension. - auto hiddenSize = options.mK; - // The cute tensor shape for A/B: (numTokens, hiddenSize). - // Note that TMA descriptor expects the first dimension's stride to be - // 1, so swap the first two dimension so that the hiddenSize dimension comes first. - auto shape = std::vector{static_cast(hiddenSize), static_cast(numTokens)}; + // TMA descriptor for B. + // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from + // makeTmaShapeStrideAb. + // + // If transposeMatrixB is true + // Logical shape is [N, K]. + // Logical strides are [K, 1]. + // Tile box shape is [tileN, tileK]. + // Tile box strides are [tileK, 1]. + // Dtype is set from options.mDtypeB. + // + // If transposeMatrixB is false + // Logical shape is [K, N]. + // Logical strides are [N, 1]. + // Tile box shape is [tileK, tileN]. + // Tile box strides are [tileN, 1]. + // Dtype is set from options.mDtypeB. + CUtensorMap tmaB; - // Assemble the stride (strideTokens, 1). - // Swap the first two dimension as mentioned before. - auto stride = std::vector{1, static_cast(hiddenSize)}; + // TMA descriptor for C, (when useTmaStore is true) + // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from + // makeTmaShapeStrideC. + // + // If transposeMmaOutput is false, + // Logical shape is [M, N]. + // Logical strides are [N, 1]. + // Tile box shape is [epilogueTileM, epilogueTileN]. + // Tile box strides are [epilogueTileN, 1]. + // Dtype is set from options.mDtypeC. + // + // If transposeMmaOutput is true, + // Logical shape is [N, M]. + // Logical strides are [M, 1]. + // Tile box shape is [epilogueTileN, epilogueTileM]. + // Tile box strides are [epilogueTileM, 1]. + // Dtype is set from options.mDtypeC. + CUtensorMap tmaC; - // Assemble the box shape - std::vector tileShape = {options.mTileK, tileMn}; + // TMA descriptor for the block scaling factors for A, for MxFp{4,8} and NvFp4 formats. + // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from + // makeTmaShapeStrideSfAb. + // The layout of scaling factors for A is always R128c4 + // + // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. + // K must be a multiple of 4P. + // The "logical" shape is: [M, K / P]. + // The R128c4 layout is: [⌈M / 128⌉, K / P / 4, 512]. + // The shape we use for TMA is: [⌈M / 128⌉, K / P / 4, 2, 256]. + // + // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. + CUtensorMap tmaSfA; - MatrixLayout layout = (matrixType == MatrixType::MatrixA) ? options.mLayoutA : options.mLayoutB; - if (layout == MatrixLayout::MajorMn) + // TMA descriptor for the block scaling factors for B, for MxFp{4,8} and NvFp4 formats. + // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from + // makeTmaShapeStrideSfAb. + // The layout of scaling factors for B is controlled by options.mSfLayoutB. + // + // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. + // The "logical" shape is: [N, K / P] + // + // If the layout is R128c4, + // K must be a multiple of 4P. + // The R128c4 layout is: [⌈N / 128⌉, K / P / 4, 512] + // The shape we use for TMA is: [⌈N / 128⌉, K / P / 4, 2, 256] + // + // If the layout is R8c4, + // K must be a multiple of 4P. + // The R8c4 layout is: [⌈N / 8⌉, K / P / 4, 32] + // The shape we use for TMA is: [⌈N / 8⌉, K / P / 4 / r, r * 32] + // where r = min(tileK / P / 4, 8) + // + // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. + CUtensorMap tmaSfB; + + // The output matrix C. The data type is controlled by options.mDtypeC. + // + // When transposeMmaOutput is true, the shape is [N, M]. + // Otherwise, the shape is [M, N]. + // Elements in a given row are stored contiguously in memory (row-major). + void* ptrC; + + // The block scaling factors to dequantize A. + // + // If DeepSeek FP8 recipe is used: + // If transposeMmaOutput is false, shape is [K / 128, M]. + // Otherwise, shape is [M / 128, K / 128]. + // The rightmost dimension is contiguous in memory. + // + // If DeepSeek FP8 recipe is not used, but for MxFp{4,8} and NvFp4 formats: + // The layout and data type is the same as explained in tmaSfA. + // + // Otherwise should be set to nullptr. + void const* ptrSfA; + + // The scaling factors to dequantize B. + // + // If DeepSeek FP8 recipe is used: + // If transposeMmaOutput is false, shape is [N / 128, K / 128]. + // Otherwise, shape is [K / 128, N]. + // The rightmost dimension is contiguous in memory. + // + // If DeepSeek FP8 recipe is not used, but for MxFp{4,8} and NvFp4 formats: + // The layout and data type is the same as explained in tmaSfB. + // + // Otherwise should be set to nullptr. + void const* ptrSfB; + + // The per-token scaling factors from scale A. + // + // This is used for either: + // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 + // * When the routing scales are applied to the input activations (only when output is not + // transposed). The dtype is Dtype::Bfloat16 + // + // The shape is [M] + void const* ptrPerTokenSfA; + + // The per-token scaling factors from scale B. + // + // This is used for either: + // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 + // * When the routing scales are applied to the input activations (only when output is + // transposed). The dtype is Dtype::Bfloat16 + // + // The shape is [N] + void const* ptrPerTokenSfB; + + // The scaling factors calculated when quantizing C, for MxFp{4,8} and NvFp4 formats, also + // used for the DeepSeek FP8 recipe. + // + // For DeepSeek FP8 recipe: + // If transposeMmaOutput is false, shape is [N / 128, M]. + // Otherwise, shape is [M / 128, N]. + // The rightmost dimension is contiguous in memory. + // + // For MxFp{4,8} and NvFp4 formats: + // If transposeMmaOutput is false, shape is [M, N / 16]. + // Otherwise, shape is [N, M / 16]. + // The layout is controlled by options.mSfLayoutC (either R128c4 or R8c4). + void* ptrSfC; + + // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. + // TensorRT-LLM API requires a scaling factor on the device. + // Shape is [1]. + float const* ptrScaleC; + + // The M dimension. + // It is the total number of tokens if A is the activation matrix. + // It is the total number of output channels if A is the weight matrix. + int32_t m; + // The N dimension. + // It is the total number of tokens if B is the activation matrix. + // It is the total number of output channels if B is the weight matrix. + int32_t n; + // The K dimension. It is the hidden dimension of the input matrices. + int32_t k; + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // All-reduce parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // The rank id of the current device in the multi-gpu space. + int rank; + // The number of peer devices in tensor-parallel group. + int tpGrpSize; + // Pointer for output with multicast mapping. It is used by the "reduce" op (LDGMC.ADD) of the + // two-shot reduce-scatter phase. + // The shape is [M, N] and the dtype is float. + void* multimemC; + + // The barriers in global memory. + // + // The kernel arrives at (with release ordering) the multicast mapping of the barrier to broadcast + // amongst peer devices. It then waits (with acquire ordering) for the unicast mapping of the + // barrier. + // + // Flags in global memory that sync on "entrance" of reduce-scatter phase in two-shot all-reduce. + // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. + // The pointer to the unicast memory created with IpcNvlsHandle. + // Must be set to 0 before the kernel launch. + void* ptrTileBars; + // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. + // The pointer to the multicast memory created with IpcNvlsHandle. + void* multimemTileBars; + + // Flags in global memory that sync on "exit" after the all-reduce finishes. + // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. + // The pointer to the unicast memory created with IpcNvlsHandle. + // Must be set to 0 before the kernel launch. + void* ptrCompletionBars; + // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. + // The pointer to the multicast memory created with IpcNvlsHandle + void* multimemCompletionBars; + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Miscellaneous parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // The barriers in global memory for Split-k reduction with exchange in GMEM. + // Each CTAs arrives at the barrier and blockIdx.z == gridDim.Z - 1 waits for the barrier to flip + // to perform a reduction. + // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. + // For DeepSeek FP8 recipe, the shape is [numTilesM * numTilesN * 2]. + // The memory must be set to 0 before the kernel launch. + void* ptrSplitKCompletionBars; + + // Pointer to the memory holding the partial sums for split-K in GMEM. + // The shape is [numSlicesForSplitK, numSlicesForSliceK, numTilesM * tileM, numTilesN * tileN]. + // The dtype is dtypeAcc, i.e. float. + void* ptrPartialSumsForSplitK; + + // In some cases, some CTAs need to exit early. E.g. when the grid is statically set, but the + // actual workload is decided at runtime. This device pointer maps to the number of non exiting + // CTAs in the X dim of the grid when transposeMmaOutput is false. And the Y dim, otherwise. + // The pointer points to a scalar and the dtype is int32_t. The pointed value must be >= 0. + int32_t* ptrNumNonExitingCtas; + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Miscellaneous parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// + + enum class MatrixType { + MatrixA = 0, + MatrixB + }; + + // Create the TMA shape/stride for A/B. + template + static auto makeTmaShapeStrideAb(GemmOptions const& options, MatrixType matrixType) + { + // The outer dimension. + auto numTokens = (matrixType == MatrixType::MatrixA) ? options.mM : options.mN; + // The inner dimension. + auto hiddenSize = options.mK; + // The cute tensor shape for A/B: (numTokens, hiddenSize). + // Note that TMA descriptor expects the first dimension's stride to be + // 1, so swap the first two dimension so that the hiddenSize dimension comes first. + auto shape = std::vector{static_cast(hiddenSize), static_cast(numTokens)}; + + // Assemble the stride (strideTokens, 1). + // Swap the first two dimension as mentioned before. + auto stride = std::vector{1, static_cast(hiddenSize)}; + // Apply transpose if necessary - std::swap(shape[0], shape[1]); - stride[1] = numTokens; - std::swap(tileShape[0], tileShape[1]); - } - else if (layout == MatrixLayout::BlockMajorK) - { - // Set shapes based on blocking layout - shape = {static_cast(options.mBlockK), static_cast(numTokens), - static_cast(options.mK / options.mBlockK)}; - stride = {1, static_cast(options.mBlockK), static_cast(numTokens * options.mBlockK)}; - - // If blockK > tileK, then the inner most box size will be based on the tile - int32_t const tileBlockK = std::min(options.mBlockK, options.mTileK); - tileShape = {tileBlockK, tileMn, options.mTileK / tileBlockK}; - } - - return std::make_tuple(shape, stride, tileShape); -} - -// Create the TMA shape/stride for C. -template -static auto makeTmaShapeStrideC(GemmOptions const& options) -{ - // The number of tokens. - auto numTokens = options.mTransposeMmaOutput ? options.mN : options.mM; - // The hidden dimension. - auto hiddenSize = options.mTransposeMmaOutput ? options.mM : options.mN; - // Note that TMA descriptor expects the first dimension's stride to be - // 1, so swap the first two dimension so that the hiddenSize dimension comes first. - auto shape = std::vector{static_cast(hiddenSize), static_cast(numTokens)}; - - // Assemble the stride (strideTokens, 1). - // Swap the first two dimension as mentioned before. - auto stride = std::vector{1, static_cast(hiddenSize)}; - - return std::make_tuple(shape, stride); -} - -// Create the TMA shape/stride for A/B block scaling factors. -template -static auto makeTmaShapeStrideSfAb(GemmOptions const& options, MatrixType matrixType, tg::SfLayout layout) -{ - // The outer dimension. - auto numTokens = matrixType == MatrixType::MatrixA ? options.mM : options.mN; - // The inner dimension. - auto hiddenSize = options.mK; - // The outer tile dimension. - auto numTokensPerTile = matrixType == MatrixType::MatrixA ? options.mTileM : options.mTileN; - // The inner tile dimension. - auto hiddenSizePerTile = options.mTileK; - // The dtype of the matrix. - tg::Dtype matrixDtype = matrixType == MatrixType::MatrixA ? options.mDtypeA : options.mDtypeB; - // Number of elements per scaling factor. - int32_t const numEltsPerSf = (matrixDtype == tg::Dtype::E2m1) ? 16 : 32; - - switch (layout) - { - case tg::SfLayout::R128c4: - { - // The scaling factor tensor packs 128x4 tiles into contiguous 512B blocks. - // The 512B block maps to a 32x16B (32x128b) block in TMEM. - // See https://nvbugspro.nvidia.com/bug/4165523 - // - // Additionally, we have to meet constraints of TMA that the box dimensions are less - // than 256 and boxDim[0] is a multiple of 16B. - // - // The "logical" tensor is: [outer, inner / numEltsPerSf] - // The aforementioned format is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 512] - // The shape we use for TMA is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 2, 256] - - auto shape = std::vector{256, 2, static_cast(tg::ceilDiv(hiddenSize, numEltsPerSf * 4)), - static_cast(tg::ceilDiv(numTokens, 128))}; - - std::vector stride(shape.size()); - stride[0] = 1; - for (size_t i = 1; i < shape.size(); i++) + if ((matrixType == MatrixType::MatrixA && options.mTransposeMatrixA) + || (matrixType == MatrixType::MatrixB && !options.mTransposeMatrixB)) { - stride[i] = shape[i - 1] * stride[i - 1]; + std::swap(shape[0], shape[1]); + stride[1] = numTokens; } - auto tileShapes - = std::vector{256, 2, static_cast(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4)), - static_cast(tg::ceilDiv(numTokensPerTile, 128))}; - - return std::make_tuple(shape, stride, tileShapes); + return std::make_tuple(shape, stride); } - case tg::SfLayout::R8c4: + // Create the TMA shape/stride for C. + template + static auto makeTmaShapeStrideC(GemmOptions const& options) { - // The scaling factor tensor packs 8x4 tiles into contiguous 32B blocks. - // - // As the inner dimension (k) is often a multiple of the tile size, we can reshape to use - // fewer read requests, if the tile dimensions allow. It does not reduce the number of - // instructions. - // - // I.e., let's define r = min(⌈hiddenSizePerTile / (numEltsPerSf * 4)⌉, 8) - // - // The "logical" tensor is: [outer, inner / numEltsPerSf] - // The 8x4 SF layout is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf), 32] - // The TMA tensor shape is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf * r), r * 32] - // - // The caveat of NumRepeats>1 is we must pad the hidden dimension of SF to multiples of - // NumRepeats * numEltsPerSf * 4. + // The number of tokens. + auto numTokens = options.mTransposeMmaOutput ? options.mN : options.mM; + // The hidden dimension. + auto hiddenSize = options.mTransposeMmaOutput ? options.mM : options.mN; + // Note that TMA descriptor expects the first dimension's stride to be + // 1, so swap the first two dimension so that the hiddenSize dimension comes first. + auto shape = std::vector{static_cast(hiddenSize), static_cast(numTokens)}; - // Detect if the supplied factor is power of 2. E.g., 0b0100 and (0b0100 - 1) == 0b0000. - int const r = options.mSfReshapeFactor; - if (r > 0 && (r & (r - 1)) != 0) + // Assemble the stride (strideTokens, 1). + // Swap the first two dimension as mentioned before. + auto stride = std::vector{1, static_cast(hiddenSize)}; + + return std::make_tuple(shape, stride); + } + + // Create the TMA shape/stride for A/B block scaling factors. + template + static auto makeTmaShapeStrideSfAb(GemmOptions const& options, MatrixType matrixType, tg::SfLayout layout) + { + // The outer dimension. + auto numTokens = matrixType == MatrixType::MatrixA ? options.mM : options.mN; + // The inner dimension. + auto hiddenSize = options.mK; + // The outer tile dimension. + auto numTokensPerTile = matrixType == MatrixType::MatrixA ? options.mTileM : options.mTileN; + // The inner tile dimension. + auto hiddenSizePerTile = options.mTileK; + // The dtype of the matrix. + tg::Dtype matrixDtype = matrixType == MatrixType::MatrixA ? options.mDtypeA : options.mDtypeB; + // Number of elements per scaling factor. + int32_t const numEltsPerSf = (matrixDtype == tg::Dtype::E2m1) ? 16 : 32; + + switch (layout) { - throw std::runtime_error("mSfReshapeFactor must be positive and a power of 2. Found " + std::to_string(r)); + case tg::SfLayout::R128c4: + { + // The scaling factor tensor packs 128x4 tiles into contiguous 512B blocks. + // The 512B block maps to a 32x16B (32x128b) block in TMEM. + // See https://nvbugspro.nvidia.com/bug/4165523 + // + // Additionally, we have to meet constraints of TMA that the box dimensions are less + // than 256 and boxDim[0] is a multiple of 16B. + // + // The "logical" tensor is: [outer, inner / numEltsPerSf] + // The aforementioned format is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 512] + // The shape we use for TMA is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 2, 256] + + auto shape = std::vector{256, 2, static_cast(tg::ceilDiv(hiddenSize, numEltsPerSf * 4)), + static_cast(tg::ceilDiv(numTokens, 128))}; + + std::vector stride(shape.size()); + stride[0] = 1; + for (size_t i = 1; i < shape.size(); i++) + { + stride[i] = shape[i - 1] * stride[i - 1]; + } + + auto tileShapes + = std::vector{256, 2, static_cast(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4)), + static_cast(tg::ceilDiv(numTokensPerTile, 128))}; + + return std::make_tuple(shape, stride, tileShapes); } - // Sanitize number of repeats so it doesn't exceed the dimension. - int const repeats = std::min(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4), r); - - // Detect if the input hidden size K is a multiple of the repeats. - if (tg::ceilDiv(hiddenSize, numEltsPerSf * 4) % repeats != 0) + case tg::SfLayout::R8c4: { - throw std::runtime_error("SF hiddenSize K (" + std::to_string(tg::ceilDiv(hiddenSize, numEltsPerSf * 4)) - + ") must be a multiple of repeats (" + std::to_string(repeats) + ")"); + // The scaling factor tensor packs 8x4 tiles into contiguous 32B blocks. + // + // As the inner dimension (k) is required to be a multiple of the tile size, we + // can reshape to use fewer read requests, if the tile dimensions allow. + // I.e., let's define r = min(⌈hiddenSizePerTile / (numEltsPerSf * 4)⌉, 8) + // + // The "logical" tensor is: [outer, inner / numEltsPerSf] + // The 8x4 SF layout is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 32] + // The TMA tensor shape is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf * r), r * 32] + + int const repeats = std::min(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4), 8); + + auto shape = std::vector{static_cast(repeats * 32), + static_cast(tg::ceilDiv(hiddenSize, numEltsPerSf * 4 * repeats)), + static_cast(tg::ceilDiv(numTokens, 8))}; + + std::vector stride(shape.size()); + stride[0] = 1; + for (size_t i = 1; i < shape.size(); i++) + { + stride[i] = shape[i - 1] * stride[i - 1]; + } + + auto tileShapes = std::vector{static_cast(repeats * 32), + static_cast(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4 * repeats)), + static_cast(tg::ceilDiv(numTokensPerTile, 8))}; + + return std::make_tuple(shape, stride, tileShapes); } - auto shape = std::vector{static_cast(repeats * 32), - static_cast(tg::ceilDiv(hiddenSize, numEltsPerSf * 4 * repeats)), - static_cast(tg::ceilDiv(numTokens, 8))}; + default: throw std::runtime_error("Unsupported SF layout"); + } + return std::make_tuple(std::vector{}, std::vector{}, std::vector{}); + } - std::vector stride(shape.size()); - stride[0] = 1; - for (size_t i = 1; i < shape.size(); i++) + // Setup the kernel parameters. + template + static KernelParams setKernelParams(GemmOptions_ const& options, void const* ptrA, void const* ptrSfA, + void const* ptrPerTokenSfA, void const* ptrB, void const* ptrSfB, void const* ptrPerTokenSfB, void* ptrC, + void* ptrSfC, void* multimemC, float* ptrScaleC, void* ptrPartialSumsForSplitK, void* ptrTileBars, + void* multimemTileBars, void* ptrCompletionBars, void* multimemCompletionBars, void* ptrSplitKCompletionBars, + int32_t* ptrNumNonExitingCtas, int rank, int tpGrpSize) + { + + // Is one-shot all-reduce? + bool const oneShotAr{options.mAllReduceAlgo == AllReduceAlgo::OneShot}; + // Is two-shot all-reduce? + bool const twoShotAr{options.mAllReduceAlgo == AllReduceAlgo::TwoShot}; + // Are there peer devices? + bool const multiDevice{tpGrpSize > 1}; + + // Create the return struct. + KernelParams params; + + // Shape/stride for gmem tensor A. + auto [shapeA, strideA] = makeTmaShapeStrideAb(options, MatrixType::MatrixA); + // Build tma descriptor for A. + params.tmaA = gemm::buildNdTmaDescriptor(options.mDtypeA, options.mMmaKind, shapeA, strideA, + options.mTransposeMatrixA ? options.mTileK : options.mTileM, + options.mTransposeMatrixA ? options.mTileM : options.mTileK, const_cast(ptrA)); + + // Shape/stride for gmem tensor B. + auto [shapeB, strideB] = makeTmaShapeStrideAb(options, MatrixType::MatrixB); + // Build tma descriptor for B. + params.tmaB = gemm::buildNdTmaDescriptor(options.mDtypeB, options.mMmaKind, shapeB, strideB, + !options.mTransposeMatrixB ? options.mTileK : options.mTileN, + !options.mTransposeMatrixB ? options.mTileN : options.mTileK, const_cast(ptrB), + /* swizzle */ !options.mSliceK); + + if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE2m1 + || options.mDtypeA == tg::Dtype::MxE4m3) { - stride[i] = shape[i - 1] * stride[i - 1]; + tg::Dtype const dTypeSfA = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; + + // Build TMA descriptor for gmem A block scaling factors. + auto [shapeSfA, strideSfA, tileShapesSfA] + = makeTmaShapeStrideSfAb(options, MatrixType::MatrixA, tg::SfLayout::R128c4); + params.tmaSfA + = gemm::buildSfTmaDescriptor(dTypeSfA, shapeSfA, strideSfA, tileShapesSfA, const_cast(ptrSfA)); } - auto tileShapes = std::vector{static_cast(repeats * 32), - static_cast(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4 * repeats)), - static_cast(tg::ceilDiv(numTokensPerTile, 8))}; - - return std::make_tuple(shape, stride, tileShapes); - } - - default: throw std::runtime_error("Unsupported SF layout"); - } - return std::make_tuple(std::vector{}, std::vector{}, std::vector{}); -} - -// Setup the kernel parameters. -template -static KernelParams setKernelParams(GemmOptions_ const& options, void const* ptrA, void const* ptrSfA, - void const* ptrPerTokenSfA, void const* ptrB, void const* ptrSfB, void const* ptrPerTokenSfB, void const* ptrBias, - void* ptrC, void* ptrSfC, void* multimemC, float* ptrScaleC, void* ptrPartialSumsForSplitK, void* ptrTileBars, - void* multimemTileBars, void* ptrCompletionBars, void* multimemCompletionBars, void* ptrSplitKCompletionBars, - int32_t* ptrNumNonExitingCtas, int rank, int tpGrpSize) -{ - - // Is one-shot all-reduce? - bool const oneShotAr{options.mAllReduceAlgo == AllReduceAlgo::OneShot}; - // Is two-shot all-reduce? - bool const twoShotAr{options.mAllReduceAlgo == AllReduceAlgo::TwoShot}; - // Are there peer devices? - bool const multiDevice{tpGrpSize > 1}; - - // Create the return struct. - KernelParams params; - - // Shape/stride for gmem tensor A. - auto [shapeA, strideA, tileShapeA] = makeTmaShapeStrideAb(options, MatrixType::MatrixA); - // Build tma descriptor for A. - params.tmaA = gemm::buildNdTmaDescriptor( - options.mDtypeA, options.mMmaKind, shapeA, strideA, tileShapeA, const_cast(ptrA)); - - // Shape/stride for gmem tensor B. - auto [shapeB, strideB, tileShapeB] = makeTmaShapeStrideAb(options, MatrixType::MatrixB); - // Build tma descriptor for B. - params.tmaB = gemm::buildNdTmaDescriptor(options.mDtypeB, options.mMmaKind, shapeB, strideB, tileShapeB, - const_cast(ptrB), - /* swizzle */ !options.mSliceK); - - if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE2m1 - || options.mDtypeA == tg::Dtype::MxE4m3) - { - tg::Dtype const dTypeSfA = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; - - // Build TMA descriptor for gmem A block scaling factors. - auto [shapeSfA, strideSfA, tileShapesSfA] - = makeTmaShapeStrideSfAb(options, MatrixType::MatrixA, tg::SfLayout::R128c4); - params.tmaSfA - = gemm::buildSfTmaDescriptor(dTypeSfA, shapeSfA, strideSfA, tileShapesSfA, const_cast(ptrSfA)); - } - - if (options.mDtypeB == tg::Dtype::E2m1 || options.mDtypeB == tg::Dtype::MxE2m1 - || options.mDtypeB == tg::Dtype::MxE4m3) - { - tg::Dtype const dTypeSfB = (options.mDtypeB == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; - - // Build TMA descriptor for gmem B block scaling factors. - auto [shapeSfB, strideSfB, tileShapesSfB] - = makeTmaShapeStrideSfAb(options, MatrixType::MatrixB, options.mSfLayoutB); - params.tmaSfB - = gemm::buildSfTmaDescriptor(dTypeSfB, shapeSfB, strideSfB, tileShapesSfB, const_cast(ptrSfB)); - } - - if (options.mUseTmaStore) - { - // Shape/stride for gmem tensor C. - auto [shapeC, strideC] = makeTmaShapeStrideC(options); - - // Swap M and N tiles for the M-major epilogue. - auto outputTileM = options.mTransposeMmaOutput ? options.mEpilogueTileN : options.mEpilogueTileM; - auto outputTileN = options.mTransposeMmaOutput ? options.mEpilogueTileM : options.mEpilogueTileN; - - // One-shot performs TMA reduction on multicast mapping of the output buffer directly. - // Two-shot performs TMA store on unicast mapping of the output buffer. The reduction happens - // in the next phase. - void* ptrTmaC{oneShotAr && multiDevice ? multimemC : ptrC}; - auto dtypeC{options.mDtypeC}; - // Regardless of output dtype, two-shot all-reduce store partial - // accumulation results to global memory in float32 precision. - if (twoShotAr && multiDevice) + if (options.mDtypeB == tg::Dtype::E2m1 || options.mDtypeB == tg::Dtype::MxE2m1 + || options.mDtypeB == tg::Dtype::MxE4m3) { - dtypeC = options.mDtypeAcc; + tg::Dtype const dTypeSfB = (options.mDtypeB == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; + + // Build TMA descriptor for gmem B block scaling factors. + auto [shapeSfB, strideSfB, tileShapesSfB] + = makeTmaShapeStrideSfAb(options, MatrixType::MatrixB, options.mSfLayoutB); + params.tmaSfB + = gemm::buildSfTmaDescriptor(dTypeSfB, shapeSfB, strideSfB, tileShapesSfB, const_cast(ptrSfB)); } - // Build tma descriptor for C. - params.tmaC = gemm::buildNdTmaDescriptor(dtypeC, tg::MmaKind::Auto, shapeC, strideC, - std::vector{outputTileN, outputTileM}, const_cast(ptrTmaC)); + if (options.mUseTmaStore) + { + // Shape/stride for gmem tensor C. + auto [shapeC, strideC] = makeTmaShapeStrideC(options); + + // Swap M and N tiles for the M-major epilogue. + auto outputTileM = options.mTransposeMmaOutput ? options.mEpilogueTileN : options.mEpilogueTileM; + auto outputTileN = options.mTransposeMmaOutput ? options.mEpilogueTileM : options.mEpilogueTileN; + + // One-shot performs TMA reduction on multicast mapping of the output buffer directly. + // Two-shot performs TMA store on unicast mapping of the output buffer. The reduction happens + // in the next phase. + void* ptrTmaC{oneShotAr && multiDevice ? multimemC : ptrC}; + auto dtypeC{options.mDtypeC}; + // Regardless of output dtype, two-shot all-reduce store partial + // accumulation results to global memory in float32 precision. + if (twoShotAr && multiDevice) + { + dtypeC = options.mDtypeAcc; + } + + // Build tma descriptor for C. + params.tmaC = gemm::buildNdTmaDescriptor( + dtypeC, tg::MmaKind::Auto, shapeC, strideC, outputTileM, outputTileN, const_cast(ptrTmaC)); + } + + // Set the dequantization factors for A and B when DeepSeek FP8 recipe is used. + params.ptrSfA = ptrSfA; + params.ptrSfB = ptrSfB; + + // Set the per-token scale factors for MetaFP8 or scale inputs + params.ptrPerTokenSfA = ptrPerTokenSfA; + params.ptrPerTokenSfB = ptrPerTokenSfB; + + // Also set ptrC (it may be used by the NCCL reduction code in "layers/Llama"). + params.ptrC = ptrC; + params.ptrScaleC = ptrScaleC; + + // The block scaling factors of C for MxFp{4,8} and NvFp4 formats. + // (not to be confused with the tensor-level scaling factor stored in ptrScaleC) + params.ptrSfC = ptrSfC; + + params.m = options.mM; + params.n = options.mN; + params.k = options.mK; + + params.rank = rank; + params.tpGrpSize = tpGrpSize; + + params.multimemC = multimemC; + params.ptrPartialSumsForSplitK = ptrPartialSumsForSplitK; + params.ptrTileBars = ptrTileBars; + params.multimemTileBars = multimemTileBars; + params.ptrCompletionBars = ptrCompletionBars; + params.multimemCompletionBars = multimemCompletionBars; + + params.ptrSplitKCompletionBars = ptrSplitKCompletionBars; + params.ptrNumNonExitingCtas = ptrNumNonExitingCtas; + return params; } - // Set the dequantization factors for A and B when DeepSeek FP8 recipe is used. - params.ptrSfA = ptrSfA; - params.ptrSfB = ptrSfB; - - // Set the per-token scale factors for MetaFP8 or scale inputs - params.ptrPerTokenSfA = ptrPerTokenSfA; - params.ptrPerTokenSfB = ptrPerTokenSfB; - - // Set the bias. - params.ptrBias = ptrBias; - - // Also set ptrC (it may be used by the NCCL reduction code in "layers/Llama"). - params.ptrC = ptrC; - params.ptrScaleC = ptrScaleC; - - // The block scaling factors of C for MxFp{4,8} and NvFp4 formats. - // (not to be confused with the tensor-level scaling factor stored in ptrScaleC) - params.ptrSfC = ptrSfC; - - params.m = options.mM; - params.n = options.mN; - params.k = options.mK; - - params.rank = rank; - params.tpGrpSize = tpGrpSize; - - params.multimemC = multimemC; - params.ptrPartialSumsForSplitK = ptrPartialSumsForSplitK; - params.ptrTileBars = ptrTileBars; - params.multimemTileBars = multimemTileBars; - params.ptrCompletionBars = ptrCompletionBars; - params.multimemCompletionBars = multimemCompletionBars; - - params.ptrSplitKCompletionBars = ptrSplitKCompletionBars; - params.ptrNumNonExitingCtas = ptrNumNonExitingCtas; - return params; -} + // Setup the kernel parameters. + template + static KernelParams setKernelParams(GemmOptions_ const& options, void const* ptrA, void const* ptrB, void* ptrC, + void* multimemC, float const* ptrScaleC, void* ptrTileBars, void* multimemTileBars, void* ptrCompletionBars, + void* multimemCompletionBars, int rank, int tpGrpSize) + { + return setKernelParams(options, ptrA, nullptr, ptrB, nullptr, ptrC, multimemC, ptrScaleC, ptrTileBars, + multimemTileBars, ptrCompletionBars, multimemCompletionBars, rank, tpGrpSize); + } #endif -}; // namespace KernelParamsSetup +}; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParamsDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParamsDecl.h deleted file mode 100644 index f248278acc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParamsDecl.h +++ /dev/null @@ -1,324 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & - * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -// NOTE: keep this code dependency free. It has to be included by the device code and has to be -// compilable with NVRTC. - -namespace gemm -{ - -namespace gemm -{ - -struct KernelParams -{ -#ifdef TLLM_ENABLE_CUDA - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // Gemm parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - - // TMA descriptor for A. - // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideAb. - // - // If layoutA is MatrixLayout::MajorK - // Logical shape is [M, K]. - // Logical strides are [K, 1]. - // Tile box shape is [tileM, tileK]. - // Tile box strides are [tileK, 1]. - // Dtype is set from options.mDtypeA. - // - // If layoutA is MatrixLayout::MajorMn - // Logical shape is [K, M]. - // Logical strides are [M, 1]. - // Tile box shape is [tileK, tileM]. - // Tile box strides are [tileM, 1]. - // Dtype is set from options.mDtypeA. - // - // If layoutA is MatrixLayout::BlockMajorK - // Logical shape is [K / blockK, M, blockK]. - // Logical strides are [M * blockK, blockK, 1]. - // Tile box shape is [tileK / min(blockK, tileK), tileM, min(blockK, tileK)]. - // Tile box strides are [tileM * min(blockK, tileK), min(blockK, tileK), 1]. - // Dtype is set from options.mDtypeA, and blockK is 128B. - CUtensorMap tmaA; - - // TMA descriptor for B. - // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideAb. - // - // If layoutB is MatrixLayout::MajorK - // Logical shape is [N, K]. - // Logical strides are [K, 1]. - // Tile box shape is [tileN, tileK]. - // Tile box strides are [tileK, 1]. - // Dtype is set from options.mDtypeB. - // - // If layoutB is MatrixLayout::MajorMn - // Logical shape is [K, N]. - // Logical strides are [N, 1]. - // Tile box shape is [tileK, tileN]. - // Tile box strides are [tileN, 1]. - // Dtype is set from options.mDtypeB. - // - // If layoutB is MatrixLayout::BlockMajorK - // Logical shape is [K / blockK, N, blockK]. - // Logical strides are [N * blockK, blockK, 1]. - // Tile box shape is [tileK / min(blockK, tileK), tileN, min(blockK, tileK)]. - // Tile box strides are [tileN * min(blockK, tileK), min(blockK, tileK), 1]. - // Dtype is set from options.mDtypeB, and blockK is 128B. - CUtensorMap tmaB; - - // TMA descriptor for C, (when useTmaStore is true) - // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideC. - // - // If transposeMmaOutput is false, - // Logical shape is [M, N]. - // Logical strides are [N, 1]. - // Tile box shape is [epilogueTileM, epilogueTileN]. - // Tile box strides are [epilogueTileN, 1]. - // Dtype is set from options.mDtypeC. - // - // If transposeMmaOutput is true, - // Logical shape is [N, M]. - // Logical strides are [M, 1]. - // Tile box shape is [epilogueTileN, epilogueTileM]. - // Tile box strides are [epilogueTileM, 1]. - // Dtype is set from options.mDtypeC. - CUtensorMap tmaC; - - // TMA descriptor for the block scaling factors for A, for MxFp{4,8} and NvFp4 formats. - // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from - // makeTmaShapeStrideSfAb. - // The layout of scaling factors for A is always R128c4 - // - // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. - // K must be a multiple of 4P. - // The "logical" shape is: [M, K / P]. - // The R128c4 layout is: [⌈M / 128⌉, K / P / 4, 512]. - // The shape we use for TMA is: [⌈M / 128⌉, K / P / 4, 2, 256]. - // - // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. - CUtensorMap tmaSfA; - - // TMA descriptor for the block scaling factors for B, for MxFp{4,8} and NvFp4 formats. - // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from - // makeTmaShapeStrideSfAb. - // The layout of scaling factors for B is controlled by options.mSfLayoutB. - // - // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. - // The "logical" shape is: [N, K / P] - // - // If the layout is R128c4, - // K must be a multiple of 4P. - // The R128c4 layout is: [⌈N / 128⌉, K / P / 4, 512] - // The shape we use for TMA is: [⌈N / 128⌉, K / P / 4, 2, 256] - // - // If the layout is R8c4, - // K must be a multiple of 4P. - // The R8c4 layout is: [⌈N / 8⌉, K / P / 4, 32] - // The shape we use for TMA is: [⌈N / 8⌉, K / P / 4 / r, r * 32] - // where r = min(tileK / P / 4, 8) - // - // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. - CUtensorMap tmaSfB; - - // The output matrix C. The data type is controlled by options.mDtypeC. - // - // When transposeMmaOutput is true, the shape is [N, M]. - // Otherwise, the shape is [M, N]. - // Elements in a given row are stored contiguously in memory (row-major). - void* ptrC; - - // The block scaling factors to dequantize A. - // - // If DeepSeek FP8 recipe is used: - // If transposeMmaOutput is false, shape is [K / 128, M]. - // Otherwise, shape is [M / 128, K / 128]. - // The rightmost dimension is contiguous in memory. - // - // If DeepSeek FP8 recipe is not used, but for MxFp{4,8} and NvFp4 formats: - // The layout and data type is the same as explained in tmaSfA. - // - // Otherwise should be set to nullptr. - void const* ptrSfA; - - // The scaling factors to dequantize B. - // - // If DeepSeek FP8 recipe is used: - // If transposeMmaOutput is false, shape is [N / 128, K / 128]. - // Otherwise, shape is [K / 128, N]. - // The rightmost dimension is contiguous in memory. - // - // If DeepSeek FP8 recipe is not used, but for MxFp{4,8} and NvFp4 formats: - // The layout and data type is the same as explained in tmaSfB. - // - // Otherwise should be set to nullptr. - void const* ptrSfB; - - // The bias applied after the GEMM. - // The bias is applied before applying the global scaling factor. I.e. - // C' = (A * B + bias') * scaleC - // scaleC = dequantA * dequantB * quantC - // Thus, the bias' = bias / (dequantA * dequantB), where the bias is the original bias. - // - // if BiasType is N, the shape is [N]. - // The bias is broadcasted along the M dimension. - // - // if BiasType is M, the shape is [M]. - // The bias is broadcasted along the N dimension. - // - // The dtype is float32. - void const* ptrBias; - - // The per-token scaling factors from scale A. - // - // This is used for either: - // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 - // * When the routing scales are applied to the input activations (only when output is not - // transposed). The dtype is Dtype::Bfloat16 - // - // The shape is [M] - void const* ptrPerTokenSfA; - - // The per-token scaling factors from scale B. - // - // This is used for either: - // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 - // * When the routing scales are applied to the input activations (only when output is - // transposed). The dtype is Dtype::Bfloat16 - // - // The shape is [N] - void const* ptrPerTokenSfB; - - // The scaling factors calculated when quantizing C, for MxFp{4,8} and NvFp4 formats, also - // used for the DeepSeek FP8 recipe. - // - // For DeepSeek FP8 recipe: - // If transposeMmaOutput is false, shape is [N / 128, M]. - // Otherwise, shape is [M / 128, N]. - // The rightmost dimension is contiguous in memory. - // - // For MxFp{4,8} and NvFp4 formats: - // If transposeMmaOutput is false, shape is [M, N / 16]. - // Otherwise, shape is [N, M / 16]. - // The layout is controlled by options.mSfLayoutC (either R128c4 or R8c4). - void* ptrSfC; - - // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. - // TensorRT-LLM API requires a scaling factor on the device. - // Shape is [1]. - float const* ptrScaleC; - - // The M dimension. - // It is the total number of tokens if A is the activation matrix. - // It is the total number of output channels if A is the weight matrix. - int32_t m; - // The N dimension. - // It is the total number of tokens if B is the activation matrix. - // It is the total number of output channels if B is the weight matrix. - int32_t n; - // The K dimension. It is the hidden dimension of the input matrices. - int32_t k; - - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // All-reduce parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - - // The rank id of the current device in the multi-gpu space. - int rank; - // The number of peer devices in tensor-parallel group. - int tpGrpSize; - // Pointer for output with multicast mapping. It is used by the "reduce" op (LDGMC.ADD) of the - // two-shot reduce-scatter phase. - // The shape is [M, N] and the dtype is float. - void* multimemC; - - // The barriers in global memory. - // - // The kernel arrives at (with release ordering) the multicast mapping of the barrier to broadcast - // amongst peer devices. It then waits (with acquire ordering) for the unicast mapping of the - // barrier. - // - // Flags in global memory that sync on "entrance" of reduce-scatter phase in two-shot all-reduce. - // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. - // The pointer to the unicast memory created with IpcNvlsHandle. - // Must be set to 0 before the kernel launch. - void* ptrTileBars; - // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. - // The pointer to the multicast memory created with IpcNvlsHandle. - void* multimemTileBars; - - // Flags in global memory that sync on "exit" after the all-reduce finishes. - // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. - // The pointer to the unicast memory created with IpcNvlsHandle. - // Must be set to 0 before the kernel launch. - void* ptrCompletionBars; - // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. - // The pointer to the multicast memory created with IpcNvlsHandle - void* multimemCompletionBars; - - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // Miscellaneous parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - - // The barriers in global memory for Split-k reduction with exchange in GMEM. - // Each CTAs arrives at the barrier and blockIdx.z == gridDim.Z - 1 waits for the barrier to flip - // to perform a reduction. - // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. - // For DeepSeek FP8 recipe, the shape is [numTilesM * numTilesN * 2]. - // The memory must be set to 0 before the kernel launch. - void* ptrSplitKCompletionBars; - - // Pointer to the memory holding the partial sums for split-K in GMEM. - // The shape is [numSlicesForSplitK, numSlicesForSliceK, numTilesM * tileM, numTilesN * tileN]. - // The dtype is dtypeAcc, i.e. float. - void* ptrPartialSumsForSplitK; - - // In some cases, some CTAs need to exit early. E.g. when the grid is statically set, but the - // actual workload is decided at runtime. This device pointer maps to the number of non exiting - // CTAs in the X dim of the grid when transposeMmaOutput is false. And the Y dim, otherwise. - // The pointer points to a scalar and the dtype is int32_t. The pointed value must be >= 0. - int32_t* ptrNumNonExitingCtas; - - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // Miscellaneous parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - - enum class MatrixType - { - MatrixA = 0, - MatrixB - }; -#endif -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace gemm - -} // namespace gemm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h index 3f3b915eee..9a4db96c7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h @@ -20,7 +20,6 @@ #include "trtllm/gen/CommonUtils.h" #include "trtllm/gen/DtypeDecl.h" #include -#include namespace gemm { @@ -78,38 +77,6 @@ public: } // Returns the offset of the ith chunk - int32_t getChunkOffsetByName(std::string const& name) const - { - for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii) - { - if (mSmemChunkNames[ii] == name) - { - return getChunkOffset(ii); - } - } - throw std::runtime_error("Name not found: " + name); - } - - // Returns the first chunk reuse flag given chunk name. - int getFirstChunkReuseFlagByName(std::string const& name) const - { - for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii) - { - if (mSmemChunkNames[ii] == name) - { - return getFirstChunkReuseFlag(ii); - } - } - throw std::runtime_error("Name not found: " + name); - } - - // Function to calculate the total size of the SMEM array - int32_t getTotalSize() const - { - return getOffsetBeforeChunk(static_cast(mNumBytesAndAlignmentPerSmemChunk.size())); - } - -private: int32_t getChunkOffset(int32_t ii) const { if (mFirstChunkReuse[ii]) @@ -124,6 +91,12 @@ private: return getSizePaddedToAlignment(offset, mNumBytesAndAlignmentPerSmemChunk[ii].second); } + // Function to calculate the total size of the SMEM array + int32_t getTotalSize() const + { + return getOffsetBeforeChunk(static_cast(mNumBytesAndAlignmentPerSmemChunk.size())); + } + // Returns the first chunk reuse flag for the ith chunk. int getFirstChunkReuseFlag(int32_t ii) const { @@ -166,7 +139,9 @@ int getNumSmemBitsPerElt(tg::Dtype dtype, tg::MmaKind mmaKind) { if (mmaKind == tg::MmaKind::Auto) { - throw std::runtime_error("mmaKind != tg::MmaKind::Auto"); + std::cout << "mmaKind != tg::MmaKind::Auto" << std::endl; + assert(false); + return -1; } if (mmaKind == tg::MmaKind::MxFp8Fp6Fp4) { @@ -187,12 +162,11 @@ public: KernelTraits() {} // The constructor. - KernelTraits(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeAcc, tg::Dtype dtypeMmaA, - tg::Dtype dtypeMmaB, tg::MmaKind mmaKind, int32_t tileM, int32_t tileN, int32_t tileK, int32_t epilogueTileM, - int32_t epilogueTileN, int32_t numStages, int32_t numStagesMma, int32_t numSlicesForSplitK, - int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore, bool transposeMmaOutput, - AllReduceAlgo allReduceAlgo, bool usePersistentScheduler, bool useDeepSeekFp8, bool usePerTokenSfA, - bool usePerTokenSfB, BiasType biasType) + KernelTraits(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeAcc, tg::MmaKind mmaKind, + int32_t tileM, int32_t tileN, int32_t tileK, int32_t epilogueTileM, int32_t epilogueTileN, int32_t numStages, + int32_t numStagesMma, int32_t numSlicesForSplitK, int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore, + bool transposeMmaOutput, AllReduceAlgo allReduceAlgo, bool usePersistentScheduler, bool useDeepSeekFp8, + bool usePerTokenSfA, bool usePerTokenSfB) : mMmaKind{mmaKind} { // @@ -207,17 +181,16 @@ public: // [rowMax ] (16B aligned) (if needed) // [sliceK ] (16B aligned) (if needed) // [per-token SF ] (16B aligned) (if needed) - // [bias ] (16B aligned) (if needed) // // SMEM for smemA and smemB might be repurposed and used for gmemC0 and gmemC1: // // [..smemA..][..smemB..][..smemBShuffle..] - // [..gmemC0..][..gmemC1..][..rowMax..][..sliceK..][..per-token SF..][..bias..] + // [..gmemC0..][..gmemC1..][..rowMax..][..sliceK..] // if (mMmaKind == tg::MmaKind::Auto) { - mMmaKind = dtypeGetMmaKind(dtypeMmaA, dtypeMmaB); + mMmaKind = dtypeGetMmaKind(dtypeA, dtypeB); } std::vector> numBytesAndAlignmentPerSmemChunk; @@ -371,29 +344,6 @@ public: firstChunkReuseSmem.emplace_back(false); } - // Bias - { - int32_t numBytesSmemBias = 0; - if (isBiasTypeN(biasType)) - { - numBytesSmemBias = tileN * sizeof(float); - } - else if (isBiasTypeM(biasType)) - { - numBytesSmemBias = tileM * sizeof(float); - } - else if (isBiasTypeMn(biasType)) - { - numBytesSmemBias = tileM * tileN * sizeof(float); - } - // Number of bytes alignment for bias - auto const numBytesAlignmentBias = 16; - // Add info. - smemChunkNames.emplace_back("smemBias"); - numBytesAndAlignmentPerSmemChunk.emplace_back(std::make_pair(numBytesSmemBias, numBytesAlignmentBias)); - firstChunkReuseSmem.emplace_back(false); - } - // Per-block absolute maximum for multi-warp reduction. { // Number of bytes: number of epilogue warps * number of tile columns. @@ -408,25 +358,6 @@ public: firstChunkReuseSmem.emplace_back(false); } - // SmemConstSfBuf - // A buffer used to copy constant values to TMEM. - { - // Do we need the buffer? - bool const useConstSfBuf = dtypeB == tg::Dtype::E4m3 && dtypeMmaB == tg::Dtype::MxE4m3; - // Number of bytes for the buffer. - auto const numSmemBytesConstSfBuf = useConstSfBuf ? 512 : 0; - // Number of bytes for the alignment of the buffer. - auto const numBytesAlignmentConstSfBuf = 16; - // No need to reuse the first chunk. - auto const reuseChunksSmemConstSfBuf = false; - - // Add info. - smemChunkNames.emplace_back("smemConstSfBuf"); - numBytesAndAlignmentPerSmemChunk.emplace_back( - std::make_pair(numSmemBytesConstSfBuf, numBytesAlignmentConstSfBuf)); - firstChunkReuseSmem.emplace_back(reuseChunksSmemConstSfBuf); - } - // Create SMEM helper object. mSmemAllocatorHelper = MemAllocatorHelper(numBytesAndAlignmentPerSmemChunk, firstChunkReuseSmem, smemChunkNames); @@ -470,12 +401,10 @@ public: // Matrix A { - // We use TMEM for A if we use slice-K or if we need to cast A. - bool const useTmemA = (numSlicesForSliceK > 1) || (dtypeMmaA != dtypeA); // Number of columns for A. - auto const numTmemColsA = useTmemA ? numStages * tileK - / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeMmaA)) - : 0; + auto const numTmemColsA = numSlicesForSliceK > 1 ? numStages * tileK + / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeA)) + : 0; // Number of columns for A alignment. auto const numColsAlignmentA = 4; // No need to reuse TMEM. @@ -489,16 +418,12 @@ public: // Sf A { - // Does the MMA require block scales in TMEM for A? - bool const useBlockScalingA = tg::dtypeIsBlockFmt(dtypeMmaA); - // Are the block scales constant? - bool const useConstSfA = useBlockScalingA && !tg::dtypeIsBlockFmt(dtypeA); + bool const useBlockScalingA = tg::dtypeIsBlockFmt(dtypeA); // Number of columns for scaling factors of A. - auto const numTmemColsSfA = useConstSfA - ? tg::roundUp((tileK / 64) * 2 * tg::ceilDiv(tileM, 64), 4) - : (useBlockScalingA ? ((tileK / 64) * 2 * tg::ceilDiv(tileM, 64)) * numStages : 0); + auto const numTmemColsSfA + = useBlockScalingA ? ((tileK / 64) * 2 * tg::ceilDiv(tileM, 64)) * numStages : 0; // Number of columns for Sf alignment. - auto const numColsAlignmentSfA = 4; + auto const numColsAlignmentSfA = 2; // No need to reuse TMEM. auto const reuseChunksTmemSfA = false; @@ -510,16 +435,12 @@ public: // Sf B { - // Does the MMA require block scales in TMEM for B? - bool const useBlockScalingB = tg::dtypeIsBlockFmt(dtypeMmaB); - // Are the block scales constant? - bool const useConstSfB = useBlockScalingB && !tg::dtypeIsBlockFmt(dtypeB); + bool const useBlockScalingB = tg::dtypeIsBlockFmt(dtypeB); // Number of columns for scaling factors of B. - auto const numTmemColsSfB = useConstSfB - ? tg::roundUp((tileK / 64) * 2 * tg::ceilDiv(tileN, 64), 4) - : (useBlockScalingB ? ((tileK / 64) * 2 * tg::ceilDiv(tileN, 64)) * numStages : 0); + auto const numTmemColsSfB + = useBlockScalingB ? ((tileK / 64) * 2 * tg::ceilDiv(tileN, 64)) * numStages : 0; // Number of columns for Sf alignment. - auto const numColsAlignmentSfB = 4; + auto const numColsAlignmentSfB = 2; // No need to reuse TMEM. auto const reuseChunksTmemSfB = false; @@ -566,14 +487,14 @@ inline int32_t getTmemBufferSize(KernelTraits traits) inline int32_t getSmemOffsetLoadA(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadA"); + return traits.mSmemAllocatorHelper.getChunkOffset(0); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetLoadB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadB"); + return traits.mSmemAllocatorHelper.getChunkOffset(1); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -587,63 +508,50 @@ inline int32_t getSmemOffsetLoadAb(KernelTraits traits) inline int32_t getSmemOffsetLoadShuffleB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBShuffle"); + return traits.mSmemAllocatorHelper.getChunkOffset(2); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetGmemC(KernelTraits traits, int resIdx = 0) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemGmemC" + std::to_string(resIdx)); + return traits.mSmemAllocatorHelper.getChunkOffset(3 + resIdx); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetRowMax(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemRowMax"); + return traits.mSmemAllocatorHelper.getChunkOffset(5); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetSliceK(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemSliceK"); + return traits.mSmemAllocatorHelper.getChunkOffset(6); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetPerTokenSf(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemPerTokenSf"); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -inline int32_t getSmemOffsetBias(KernelTraits traits) -{ - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBias"); + return traits.mSmemAllocatorHelper.getChunkOffset(7); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetBlockAmax(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBlockAmax"); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -inline int32_t getSmemOffsetConstSfBuf(KernelTraits traits) -{ - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemConstSfBuf"); + return traits.mSmemAllocatorHelper.getChunkOffset(8); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0) { - return traits.mSmemAllocatorHelper.getFirstChunkReuseFlagByName("smemGmemC" + std::to_string(resIdx)); + // Be conscious that the index (3 + resIdx) should match the index in getSmemOffsetGmemC(). + return traits.mSmemAllocatorHelper.getFirstChunkReuseFlag(3 + resIdx); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -654,28 +562,28 @@ inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0) inline int32_t getTmemOffsetD(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemD"); + return traits.mTmemAllocatorHelper.getChunkOffset(0); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetA(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemA"); + return traits.mTmemAllocatorHelper.getChunkOffset(1); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetSfA(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfA"); + return traits.mTmemAllocatorHelper.getChunkOffset(2); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetSfB(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfB"); + return traits.mTmemAllocatorHelper.getChunkOffset(3); } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/TmaDescriptor.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/TmaDescriptor.h index a246ac35b3..0b7574260e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/TmaDescriptor.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/TmaDescriptor.h @@ -41,14 +41,14 @@ namespace tg = trtllm::gen; #ifdef TLLM_ENABLE_CUDA inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, std::vector const& shapes, - std::vector const& strides, std::vector const& tileShapes, void* gmemAddr, bool doSwizzle = true) + std::vector const& strides, int32_t tileSizeMn, int32_t tileSizeK, void* gmemAddr, bool doSwizzle = true) { // The multiplication factor of the data padding in SMEM. int32_t padMultiplier = 1; CUtensorMap desc{}; // The data type. CUtensorMapDataType tmaDataFormat{CU_TENSOR_MAP_DATA_TYPE_FLOAT32}; - if (dtype == tg::Dtype::E4m3 || dtype == tg::Dtype::MxE4m3 || dtype == tg::Dtype::UE8m0) + if (dtype == tg::Dtype::E4m3 || dtype == tg::Dtype::MxE4m3) { tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_UINT8; } @@ -71,11 +71,15 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st padMultiplier = 2; tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B; } + else if (mmaKind == tg::MmaKind::MxFp4NvFp4 || mmaKind == tg::MmaKind::Auto) + { + tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B; + } else { - // Note: this is used with the MMA kind MxFp4NvFp4 and also when casting to a higher-precision - // type such as Bfloat16 before the MMA. - tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B; + std::cerr << "Invalid dtype / mmaKind combination " << tg::dtypeToString(dtype) << "/" + << tg::mmaKindToString(mmaKind) << std::endl; + assert(false); } } else if (dtype == tg::Dtype::Fp32) @@ -90,30 +94,24 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st // The swizzle type. CUtensorMapSwizzle swizzleType{CU_TENSOR_MAP_SWIZZLE_NONE}; - int32_t fastestDimTileSizeBytes = (tileShapes[0] * tg::dtypeGetNumBits(dtype) * padMultiplier) / /* bits */ 8; + int32_t tileKSizeInBytes = (tileSizeK * tg::dtypeGetNumBits(dtype) * padMultiplier) / /* bits */ 8; if (doSwizzle) { - if ((fastestDimTileSizeBytes % 128) == 0) + if ((tileKSizeInBytes % 128) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_128B; } - else if ((fastestDimTileSizeBytes % 64) == 0) + else if ((tileKSizeInBytes % 64) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_64B; } - else if ((fastestDimTileSizeBytes % 32) == 0) + else if ((tileKSizeInBytes % 32) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_32B; - // This path is only for the scaling factors. - } - else if ((fastestDimTileSizeBytes % 16) == 0 && (dtype == tg::Dtype::UE8m0 || dtype == tg::Dtype::E4m3)) - { - swizzleType = CU_TENSOR_MAP_SWIZZLE_NONE; } else { - std::cerr << "buildNdTmaDescriptor: unexpected fastestDimTileSizeBytes " << fastestDimTileSizeBytes - << std::endl; + std::cerr << "buildNdTmaDescriptor: unexpected tileKSizeInBytes " << tileKSizeInBytes << std::endl; assert(false); } } @@ -123,9 +121,8 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st // Check shape must be in range [1, 2^32] int32_t dim = shapes.size(); - // Expect 2 dimensions for regular gemm, 3 dimensions for batched gemm or blocked layout, and 4 - // dimensions for batched gemm with blocked layout. - assert(dim == 2 || dim == 3 || dim == 4); + // Expect 2 dimensions. + assert(dim == 2 || dim == 3); // Check shape range. for (int32_t ii = 0; ii < dim; ++ii) { @@ -150,74 +147,59 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st // The number of elements in 128B. auto const numEltsIn128B = numEltsPerUInt32 /*4B*/ * 32; // The number of tile K hidden size (per token) in each block of shared memory. - auto const numEltsInClampedFastestTileSize = std::min(numEltsIn128B, tileShapes[0]); + auto const numEltsInClampedTileKSize = std::min(numEltsIn128B, tileSizeK); - // Build box dim array. If tileShapes is smaller than dim, just fill with 1s. - assert(static_cast(tileShapes.size()) <= dim); - std::vector boxDim(dim, 1); - boxDim[0] = numEltsInClampedFastestTileSize; - for (size_t ii = 1; ii < tileShapes.size(); ++ii) - { - if (tileShapes[ii] > 256) - { - std::cerr << "buildNdTmaDescriptor: boxDim too large " << tileShapes[ii] << std::endl; - assert(false); - } - else - { - boxDim[ii] = tileShapes[ii]; - } - } + // Build tile shapes. + std::vector tileShapes(dim, 1); + tileShapes[0] = numEltsInClampedTileKSize; // tileSizeK + tileShapes[1] = tileSizeMn; // tileSizeMn // Set tile strides to 1; std::vector tileStrides(dim, 1); // Build the descriptor. CUresult result = cuTensorMapEncodeTiled(&desc, tmaDataFormat, - /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), boxDim.data(), tileStrides.data(), + /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), tileShapes.data(), tileStrides.data(), /*interleave=*/CU_TENSOR_MAP_INTERLEAVE_NONE, swizzleType, /*l2Promotion=*/CU_TENSOR_MAP_L2_PROMOTION_L2_128B, /*oobFill=*/CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); if (result != CUDA_SUCCESS) { - char const* errorString; - cuGetErrorString(result, &errorString); - std::stringstream ss; - ss << "Error: Failed to initialize the TMA descriptor " << result << std::endl; + std::cerr << "Error: Failed to initialize the TMA descriptor " << result << std::endl; - ss << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; + std::cerr << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr + << std::endl; - ss << "Shape: "; + std::cerr << "Shape: "; for (int ii = 0; ii < dim; ++ii) { - ss << shapes[ii] << " "; + std::cerr << shapes[ii] << " "; } - ss << std::endl; + std::cerr << std::endl; - ss << "Stride: "; + std::cerr << "Stride: "; for (int ii = 0; ii < dim - 1; ++ii) { - ss << stridesInBytes[ii] << " "; + std::cerr << stridesInBytes[ii] << " "; } - ss << std::endl; + std::cerr << std::endl; - ss << "tileShapes: "; + std::cerr << "tileShapes: "; for (int ii = 0; ii < dim; ++ii) { - ss << boxDim[ii] << " "; + std::cerr << tileShapes[ii] << " "; } - ss << std::endl; + std::cerr << std::endl; - ss << "tileStrides: "; + std::cerr << "tileStrides: "; for (int ii = 0; ii < dim; ++ii) { - ss << tileStrides[ii] << " "; + std::cerr << tileStrides[ii] << " "; } - ss << std::endl; - ss << "swizzleType: " << int(swizzleType) << std::endl; - ss << "(in " << __FILE__ << ":" << __LINE__ << ")" << std::endl; - throw std::runtime_error(ss.str()); + std::cerr << std::endl; + std::cerr << "swizzleType: " << int(swizzleType) << std::endl; + assert(false); } return desc; @@ -285,44 +267,41 @@ inline CUtensorMap buildSfTmaDescriptor(tg::Dtype dtype, std::vector c if (result != CUDA_SUCCESS) { - char const* errorString; - cuGetErrorString(result, &errorString); - std::stringstream ss; - ss << "Error: Failed to initialize the TMA descriptor for SF " << errorString << std::endl; + std::cerr << "Error: Failed to initialize the TMA descriptor for SF " << result << std::endl; - ss << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; + std::cerr << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr + << std::endl; - ss << "shape:"; + std::cerr << "shape:"; for (uint32_t shape_i : shapes) { - ss << " " << shape_i; + std::cerr << " " << shape_i; } - ss << std::endl; + std::cerr << std::endl; - ss << "stridesInBytes:"; + std::cerr << "stridesInBytes:"; for (uint32_t stride_i : stridesInBytes) { - ss << " " << stride_i; + std::cerr << " " << stride_i; } - ss << std::endl; + std::cerr << std::endl; - ss << "tileShapes:"; + std::cerr << "tileShapes:"; for (uint32_t tileShape_i : tileShapes) { - ss << " " << tileShape_i; + std::cerr << " " << tileShape_i; } - ss << std::endl; + std::cerr << std::endl; - ss << "tileStrides:"; + std::cerr << "tileStrides:"; for (uint32_t tileStride_i : tileStrides) { - ss << " " << tileStride_i; + std::cerr << " " << tileStride_i; } - ss << std::endl; + std::cerr << std::endl; - ss << "swizzleType: " << int(swizzleType) << std::endl; - ss << "(in " << __FILE__ << ":" << __LINE__ << ")" << std::endl; - throw std::runtime_error(ss.str()); + std::cerr << "swizzleType: " << int(swizzleType) << std::endl; + assert(false); } return desc; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/config.json b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/config.json index d502017fc2..fbbcdfa059 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/config.json +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/config.json @@ -12,6 +12,7 @@ "epilogueTileM": 128, "epilogueTileN": 8, "numStages": 4, + "numMmaStages": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -42,6 +43,7 @@ "epilogueTileM": 128, "epilogueTileN": 128, "numStages": 3, + "numMmaStages": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -73,6 +75,7 @@ "epilogueTileM": 64, "epilogueTileN": 8, "numStages": 3, + "numMmaStages": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -102,6 +105,7 @@ "epilogueTileM": 128, "epilogueTileN": 128, "numStages": 3, + "numMmaStages": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -131,6 +135,7 @@ "epilogueTileM": 128, "epilogueTileN": 8, "numStages": 3, + "numMmaStages": 1, "numSlicesForSplitK": 2, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -177,6 +182,7 @@ "numStagesMma": 2, "numStagesMmaWithinWorkTile": 2, "useTwoMmaWarps": true, + "useMetaFp8": false, "usePdl": true }, "GemmDeepSeekFp8Throughput": { @@ -206,6 +212,7 @@ "numStagesMma": 2, "numStagesMmaWithinWorkTile": 2, "useTwoMmaWarps": true, + "useMetaFp8": false, "usePdl": true, "gridTriggerSecondaryA": true, "gridTriggerSecondaryB": false, @@ -225,6 +232,7 @@ "epilogueTileM": 128, "epilogueTileN": 8, "numStages": 3, + "numMmaStages": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -238,6 +246,7 @@ "useCustomMmaSchedule": true, "sfLayoutB": "8x4", "sfLayoutC": "8x4", + "useMetaFp8": false, "gridTriggerSecondaryB": true, "gridWaitForPrimaryA": false, "gridWaitForPrimaryB": true, diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp new file mode 100644 index 0000000000..3d32c2ee25 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:856ce9e462068d464a244eb5179277c6aeb4eba8c9767b354d664eb6eafee0d3 +size 416980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..1657a0701f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c461d5767472f619e7cffd41cc609bb9bf244b78342c55a1b42ae344ccc87292 +size 523680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..c21ee7f925 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d393b7e86991ea2757655b479ef75bfe660f3a1846f46c38e6f55c6ba9d6a25 +size 558316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp new file mode 100644 index 0000000000..2f35e76621 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ecd12d9d9e7d4cec0e7c530e72328420c868f37bab285ed55864776fc6eeec7 +size 304696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..15535b511a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff5b953f3226300d647adc3328d04fae0888b2de91f39a27f5ce7efc6f88f15e +size 401032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..6c80cc5381 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f921e229f0d48546a2087d02f526e0c5c8d5189696ec2e71349227182d1bee0 +size 438480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..2a9bb9cb19 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1acf78f5c7f9505a95f782a4c781c94a9b34bb5958c8f511f32058da40f81868 +size 418890 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..d191118f3b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32f834741c1c2f721409b71b5aaa45b79e7d337c5fc422af33a1bbe1b56b3da5 +size 455548 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..9708ab6fef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87785c72e84d52ff962f252e98868e1cc3f2595aaa1e9aaf2924fa50e886aba2 +size 458160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..00515a3f3b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6055448d6d6cbb547b3d5656fecb5044465d88be3121e42e6b3c39f96e3bd828 +size 495608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..9f4c2f4187 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df7b94d53bc5517b94f6c2c5c7e6108695a32809cbc55e7d83124f07c06a786c +size 426334 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..4179145568 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b64cf6bf660b14299c1957170426f089af93e19fccc93295fd32f0c5df77951d +size 463830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..bacabe8b3a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9359453507f25d7d90bf1f6ac4a453756ba9f8006d6c76fcb3ff09a1ba8cf71a +size 305610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..76d4f50448 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52a9dc57d86dc176ad59234a764959c0ebec01d9738889a8989fbfca925cf72f +size 338088 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..daef06303d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:631e364a1ae1a29d386f624849176118c5d4f7b01e38f7c973f190d89e7136f2 +size 506554 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..5d0f3377e6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d52dfd5ac422362ede96cbda888383f4452df7dc39d6653f8529a560d6b12d37 +size 687361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..adfbdfa4ad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b8ed23721a4d1eb0260e36dff2bad3cb1b603d287ef7584cf18c5db73ee869f +size 722835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp new file mode 100644 index 0000000000..354888f282 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12cea6d18fe0c95f6eb073e3956ca785835b2740ee65b7d8f934313f709cae87 +size 317072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..e52810e22e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5901f4239aff1497ca81fd9f853fdb35fb2a14ab35c89b1acbde9c87fc909da1 +size 423322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..005124d823 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db42b1e4d61dc69b6a5ac4304fb3e89eb04e9c858af216ee88f877099f400013 +size 460770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..f280d1b90d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca8b1eb8b32d9211ed9f4842548e57128ee6ede550d4dcc3b6ac804de45a9f2b +size 460470 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..c458bd9b23 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70dc60c09fae6aa9bfa809107032a931b5edfc211d66b374b5092192c902c222 +size 497918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..3ac72b581f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae575f338e0db3bb053f9cfcee4804a75611b330b87718e18f7692f865ca6984 +size 538960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..9741082990 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c59fa6c8e96dedcb4956d1fbe3282ed1f809dcd34499bb9b4d7b06bc168b579d +size 575568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..0ef28473e5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c6028820f760089f09b0a2e4b32fa0c76725f56ebdfd97fc6c53331616282d6 +size 438116 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..516f77e48a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0760d0c99a44691f281310268a6751ea62292b2205719b285ff9e7429fcffaf1 +size 475614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..03aad1c232 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:173abead241a103af02a25daf432d028324561d57f64d9a6d2087cd444c45758 +size 304814 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..02617fa32b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92b924875fff3571d0efe3478f2fdddb11a8ec60796830f3dd172a7b4f24acac +size 335712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp new file mode 100644 index 0000000000..9c05d1e6c2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba1a393cd48175f1cfc18436435ea638de675e0d98847510d97bd03d8bae234d +size 421756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..406e016af2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54e20ad645aafe8ff45abd1c6c8d418c6d0f2c7cd38eb01d6969a32b7cea60de +size 528458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..f475229583 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b74282de9bcfced0c173d9aee838a54c8ba286bbc84719fcdb81dd9cbac7f15 +size 563882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp new file mode 100644 index 0000000000..c765b304d3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9527b1cee4c45bf8d4346493fddb1b0f8dd3e6abf90ab358dfdf8864533a130 +size 309474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..96c0d7a7ac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8669d845fb8bdacda6073d674ca9325a0b241591e1d020020b81977006cf71f +size 400234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..872c8bccec --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c5e32724bfdbe9ac14ad6d54822778c6a1ec63772db04160719aaaed8d29f51 +size 438472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..3d70d3b63c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3339a4104298437c3628354d2b14431933314e9f2d2ecdce7b88865dccbee038 +size 418882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..d9d1236666 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f4bfa0a6a96895cc2165a02f0c83f2627cf625ff6807a5be9c6f7bdbd210428 +size 456330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..340e167a2d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac4fe726d14a219bb03ce7bd9bbdeeb5ea7e3195d64a4c1b9cf0018d53aa3df5 +size 459780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..d44d09ccfb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de6c8769b5814f20ca34ba8df001c795dc54ae81267488bb18510fe5d334f510 +size 498016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..59566f3bab --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a8b5e0bdece2e73f72e300afab464d0680aa00b9cbcbe08903ba0210780e149 +size 425536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp new file mode 100644 index 0000000000..44856c7481 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdb1d9d1b80de2668ddd79db5c491d3a768188b4b3f5c5d10efb335af7ff28ae +size 463034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..4328168985 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0835489e53c9690803dd1e44ec4290b6ddd9d4294bcb6c6d76b9f39680d12964 +size 304814 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..e8a2a1a09b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:520f1eed6f696ceaa3b40bc534a96120a1f01b616f6a93d60606b9e72a13c3d6 +size 336502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..0a252fa5e7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a9fa4a70c46c56dbc57d38d6578b413554d5f24ee9bd9e223c9a14dca856db1 +size 504966 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp new file mode 100644 index 0000000000..f5ac8a259d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e36ec5821ab21cdaa67e44755daaea4896b21d8324ad95b1227cd5060ba06df +size 421164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..61c6525168 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:675e4a797e20b6a606fa670c6a02d23dedc06affba9bedbe9d5eef4c9bf4fc28 +size 505954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..8d2a38bce3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51dd2e16aae507d655121a3869b4a9db92ed11a69d4a04f56d1716743740d38a +size 516120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp deleted file mode 100644 index 77d0a99b96..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:10ab1fa6850215c55485f18f841da757a1c98a10702c69c218a816b36fdac81d -size 402540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index de8d77a2b4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:21882f276d02f03239844b86e520ac19ca9c02c102a94d4c1d3e0455d3fbc195 -size 511260 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 25972befef..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6a8b4624db20a64ffe4e47af05fe23dfd1b4817948313331c31fa691335442e5 -size 542788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp deleted file mode 100644 index e1a577ebe2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aa07f20606b725cb95d7c76c655b82818d06631839f0ec5665ed73485b87c3f8 -size 291096 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 0a4798919c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5a5dc8e158b5949c018d35bff4c3ad01d72f83df2ca286950345bc0a56b5b074 -size 388906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 1eb4c5dcbf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7754f52a7e19ccbed43cdb635195b35de68cae5dfbcabbe8426c3b9a074bbc8d -size 423592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 50a6697360..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4f747338e56ab3db6a5d22ef7e869b9f6e00308583f160540460181e3e24e69d -size 405976 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index f440cfd24d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e177fb082e81fefa33065c229c807924afb1c16124ecd166fdad0910bdf29971 -size 441450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 929eeb1dbd..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb0c41ea0ea4277e8910fb3c6ec835e19762f864ce67d49515282316a7a07d89 -size 446034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index fcb1d856b6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a1cc074235b2516200058ea1b6d33d9fa50985351d6fd337d4f7255ea3786b15 -size 480720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index db8bc1be45..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:db41d7ec6fadab632ef27158aeb1016d6a1ba487d8cb24baba53777f19d726ce -size 414998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 100d47e3b7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c4ca0d79c0b7c53167348509657267e4ccf29febfb2b0c929a1bad85ee8201c1 -size 449684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp deleted file mode 100644 index e5c66f76ba..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:82c9b1a2e6e9570f16b7e38a6570e138edf416433758da010a025550a7d1d83b -size 291990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp deleted file mode 100644 index aa3f9888f9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1a30b56272d60bbef5963e35770bd929156306170b275aae47f011b42bd2bf9e -size 296100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp deleted file mode 100644 index e8ea278b38..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:14bec10a3695e2db65fb8254a52bc3db54bf76fae50bc96920e191b02eb0c2a6 -size 482720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 4604422f32..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ad597736ad5068946e62334411abdf8e7f31327d31f1b7703b6b881964275ef -size 674989 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index a0d62a251f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f4ba8b4d74674c0ae8aed9354b28779c29be451a708ec9c6eb161591ffa3839e -size 707257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 4b3ae479c6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e73f3bdba906c7ce17bdecc7fb02b038a8e9315fed1784c3df011245416c99fc -size 411986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 3d2ab00446..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:23a610adf01983ea06f09ecbdfe36ee6116751517171bc8e130403dbb3fc68ca -size 446672 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index cd77c5b4e7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f3ddfdee3b39fe5c2d90c3ed85c44a33b83febb9d86793bb52235ec9f70e8b1 -size 449134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index a90de6422c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:062191382dfe7352ee19677a41b812f149818e7ccff61ab8cf6818ce372d1aea -size 483820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 39631961ee..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61c2df67fdf28e34db4a27504547a81de65590fc5dff1b6063bf8649dd2fc6df -size 525206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index f5bb5509bc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:740ea0a536f0047c05dbe66c91294857be694e1260e41713f1f1e0b5b37ee5ff -size 561470 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 42d8202a61..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c6b4e8b684058a352343892a43f2ceff21b645560c36cb5ed9ef5a62b5903add -size 425992 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index fe8d9e4f3c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3d9a0badabc929e926cba4ad2f8f88d64225ee6501f883f8123d8d7b1938f0c5 -size 460676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp deleted file mode 100644 index a907909b32..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c01197f1c4cab230b57a8b36a026038af238423432fce8e1fd883481d2935034 -size 289614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp deleted file mode 100644 index 4c2b2a627d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:14c5c5ce6f433c0a5706f0f353f4c45db650188fd1c7d07ef64b47d226a12bc1 -size 294514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp deleted file mode 100644 index d8b2ba966a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2e5b61a079c0a5a9ac8008e86155eea2261bc75bfaff0ac50cb14aec85418cd0 -size 408106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 97e546ee94..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ca0c14f039c4f04866dec6567a530af9c50b6c55735a9de2be39c5061509dc91 -size 516826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 128565544d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d1436f956bca72e2d0299474607d229e5fae2d2095fb230eb77a0f024b0582d -size 548354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp deleted file mode 100644 index 87c6045e55..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a9c67d984671b707b7dbb116dbceca1e1ad9783b41654a12e86c296d74ff8ec -size 295874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index bb9cec8d50..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:62f39085d942f91d6345e40d5fb8ad503da5bf10d5dc7cc14bdc8f4c81e84ab8 -size 388110 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 1bab11accf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:da82f5b1fa833552029337e9dc15065cb72b05ee9cb232aff3dcae5e9dccabe1 -size 423584 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 2c819e4de0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:da251f397db77fa29ad3d46b34d9cf5e26222faaa989132d4f647731b8ae93bf -size 405968 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index beb2a10735..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f7eec256e19ebf1224223cc00b9f1070f5f9609262a223417911c91613fa8169 -size 442232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index e2ad3277bf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a657c85ce833955d14af4fe2f7d7a064b203a81eef02ba58ec72dbef02d99be3 -size 447654 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index 7f5dd60f15..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b2a3a197d059e3127c05eff531b841cb7664ea508a297fcca167e91f8be6e43 -size 483080 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index ee86d503d0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c22c5844544b88950bcaa9ea162cb0d9d19d48bb33006e7d50d2f9974175d49c -size 413412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp deleted file mode 100644 index c1e8c2d985..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8b01d19683cfc3ec1d38291cff6b96f430dda1ab0d0d7666af2766a6212d6f2c -size 448886 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp deleted file mode 100644 index a3b6d9c5f4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:736250708edba66db91c9f3672ab850ac53b90848c1cd6e232d064a1d9b5a930 -size 290404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp deleted file mode 100644 index c83f197419..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e44ca67850f9c8872f061c045bbd59e99f7b76be5e1a946434a6b1c20da6af76 -size 295304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp deleted file mode 100644 index 30109803c8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4d67677108c5a5b7728ab83464d503b0fd54c6e5fdb46f2b1db301049b9c76ae -size 481134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp deleted file mode 100644 index 9165f1641c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:67e6daf5493c8cff97092f7c180f0dca736bc7df3a51c1fd8c537c5c8fcf65f3 -size 406676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp deleted file mode 100644 index b74bd86aed..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ab06bf2b980dd36301380601ad2c366dd4369270869a71e8e8b7f61f2242e77a -size 482120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp deleted file mode 100644 index fd6b119a8b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5ddc440fa5a729425dc0f453955dda44888657f73927698c8e14cb9a01dce4e1 -size 492288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp index 25eb9cd915..c5d5a18c00 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp @@ -26,14 +26,14 @@ namespace tensorrt_llm { namespace kernels { -using namespace gemmGatedAct::gemmGatedAct; -static GemmGatedActInterface::ModuleCache globalTrtllmGenGemmGatedActModuleCache; + +static gemmGatedAct::GemmGatedActInterface::ModuleCache globalTrtllmGenGemmGatedActModuleCache; TrtllmGenGemmGatedActRunner::TrtllmGenGemmGatedActRunner(TrtllmGenGemmGatedActRunnerOptions const& options_) : mOptions(options_) { // Select a GEMM kernel config to use - auto const gemm = GemmGatedActInterface(); + auto const gemm = gemmGatedAct::GemmGatedActInterface(); auto const configs = gemm.getGemmConfigs(); mPassingConfigIndices.clear(); @@ -43,7 +43,7 @@ TrtllmGenGemmGatedActRunner::TrtllmGenGemmGatedActRunner(TrtllmGenGemmGatedActRu auto const options = configs[i].mOptions; // When we include low-latency kernels we can set transposeMmaOutput via constructor - if (options.mDtypeA == mOptions.eltType && options.mDtypeC == mOptions.outputType + if (options.mDtypeElt == mOptions.eltType && options.mDtypeC == mOptions.outputType && options.mUseDeepSeekFp8 == mOptions.deepSeekFp8 && options.mTransposeMmaOutput == mOptions.transposeMmaOutput) { @@ -56,14 +56,14 @@ TrtllmGenGemmGatedActRunner::TrtllmGenGemmGatedActRunner(TrtllmGenGemmGatedActRu size_t TrtllmGenGemmGatedActRunner::getWorkspaceSizeInBytes(int32_t m, int32_t n, int32_t k) { - GemmGatedActData gemmData; + gemmGatedAct::GemmGatedActData gemmData; gemmData.mProblemDimensions.mM = mOptions.transposeMmaOutput ? n : m; gemmData.mProblemDimensions.mN = mOptions.transposeMmaOutput ? m : n; gemmData.mProblemDimensions.mK = k; selectGemmConfig(m, n, k); - auto gemm = GemmGatedActInterface(); + auto gemm = gemmGatedAct::GemmGatedActInterface(); auto const configs = gemm.getGemmConfigs(); TLLM_CHECK_WITH_INFO( mSelectedConfigIndex.has_value(), "No valid kernel found for given param config and problem size"); @@ -76,9 +76,9 @@ void TrtllmGenGemmGatedActRunner::run(int32_t m, int32_t n, int32_t k, void cons void const* b, float const* bScale, void* c, float* cScale, float* cScaleGate, void* workspace, CUstream stream, int device) { - auto gemm = GemmGatedActInterface(); + auto gemm = gemmGatedAct::GemmGatedActInterface(); - GemmGatedActData gemmData; + gemmGatedAct::GemmGatedActData gemmData; auto const configs = gemm.getGemmConfigs(); TLLM_CHECK_WITH_INFO( @@ -107,7 +107,7 @@ void TrtllmGenGemmGatedActRunner::run(int32_t m, int32_t n, int32_t k, void cons gemm.runInitBeforeWorldSync(config, gemmData, static_cast(stream)); auto const err = gemm.run(config, workspace, gemmData, static_cast(stream), multiProcessorCount, - /*usePdl=*/true, globalTrtllmGenGemmGatedActModuleCache); + globalTrtllmGenGemmGatedActModuleCache); TLLM_CHECK_WITH_INFO(err == 0, "Error occurred when running GEMM!"); } @@ -120,10 +120,10 @@ void TrtllmGenGemmGatedActRunner::run(int32_t m, int32_t n, int32_t k, void cons void TrtllmGenGemmGatedActRunner::selectGemmConfig(int32_t m, int32_t n, int32_t k) { - auto const gemm = GemmGatedActInterface(); + auto const gemm = gemmGatedAct::GemmGatedActInterface(); auto const configs = gemm.getGemmConfigs(); - GemmGatedActData gemmData; + gemmGatedAct::GemmGatedActData gemmData; // Dims gemmData.mProblemDimensions.mM = mOptions.transposeMmaOutput ? n : m; gemmData.mProblemDimensions.mN = mOptions.transposeMmaOutput ? m : n; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h index cbd6bada46..f7c30c9e0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h @@ -28,8 +28,8 @@ namespace kernels struct TrtllmGenGemmGatedActRunnerOptions { - gemmGatedAct::trtllm::gen::Dtype eltType; - gemmGatedAct::trtllm::gen::Dtype outputType; + trtllm::gen::Dtype eltType; + trtllm::gen::Dtype outputType; bool deepSeekFp8{false}; bool transposeMmaOutput{false}; }; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/Enums.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/Enums.h index d1a31876f3..14c5d15b53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/Enums.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/Enums.h @@ -18,9 +18,6 @@ #include -namespace gemmGatedAct -{ - namespace gemm { @@ -39,31 +36,6 @@ enum class AllReduceAlgo : uint32_t //////////////////////////////////////////////////////////////////////////////////////////////////// -enum class MatrixLayout -{ - // K-major layout (default). [Mn, K] - MajorK = 0, - // M-major for A and N-major for B. [K, Mn] - MajorMn, - // Layout is blocked along the K dimension as seen in the diagram below. [K / blockK, Mn, blockK] - // where blockK is fixed at 128B - // - // ├────────────── K ──────────────┤ - // ┬ ┬ ├──── K block ───┤ - // │ │ │ 0 1 2 3 ║ 32 33 34 35 │ - // │ CTA0 │ 4 5 6 7 ║ 36 37 38 39 │ - // │ │ │ 8 9 10 11 ║ 40 41 42 43 │ - // │ ┴ │ 12 13 14 15 ║ 44 45 46 47 │ - // M ┬ ├────────────────║────────────────┤ - // │ │ │ 16 17 18 19 ║ 48 49 50 51 │ - // │ CTA1 │ 20 21 22 23 ║ 52 53 54 55 │ - // │ │ │ 24 25 26 27 ║ 56 57 58 59 │ - // ┴ ┴ │ 28 29 30 31 ║ 60 61 62 63 │ - BlockMajorK -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - enum class SplitK : uint32_t { // No split-k is needed. I.e. mNumSlicesForSplitK == 1. @@ -79,20 +51,6 @@ enum class SplitK : uint32_t //////////////////////////////////////////////////////////////////////////////////////////////////// -enum class BiasType : uint32_t -{ - // No bias. - None = 0, - // One bias value per N of the output tensor. - M = 1, - // One bias value per row M of the output tensor. - N = 2, - // One bias value for each element of the output tensor. - Mn = 3, -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - enum class TileScheduler { // Static scheduler (Non-persistent). @@ -119,23 +77,4 @@ SPLIT_K_FUNCTION(Dsmem) //////////////////////////////////////////////////////////////////////////////////////////////////// -// Helper functions to check the Bias type. - -#define BIAS_TYPE_FUNCTION(Mode) \ - inline bool isBiasType##Mode(BiasType type) \ - { \ - return (type == BiasType::Mode); \ - } - -BIAS_TYPE_FUNCTION(None) -BIAS_TYPE_FUNCTION(N) -BIAS_TYPE_FUNCTION(M) -BIAS_TYPE_FUNCTION(Mn) - -#undef BIAS_TYPE_FUNCTION - -//////////////////////////////////////////////////////////////////////////////////////////////////// - } // namespace gemm - -} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h index f4cd7e2ad2..a8087dc59a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h @@ -17,7 +17,6 @@ #pragma once #include -#include #include "GemmGatedActOptions.h" #include "KernelParams.h" @@ -30,9 +29,6 @@ namespace gemmGatedAct { -namespace gemmGatedAct -{ - //////////////////////////////////////////////////////////////////////////////////////////////////// // // GemmGatedActData @@ -55,19 +51,14 @@ struct GemmGatedActData int32_t mK{0}; // The rank id of the current device in the multi-gpu space. int32_t mRank{0}; - // The number of devices in tensor-parallel group. + // The number of peer devices in tensor-parallel group. int32_t mWorldSize{0}; }; struct InputBuffers { - // The matrix A. The data type is controlled by options.mDtypeA. - // - // When layoutA is MatrixLayout::MajorK, the shape is [M, K]. - // When LayoutA is MatrixLayout::MajorMn, the shape is [K, M]. - // When LayoutA is MatrixLayout::BlockMajorK, the shape is [K / blockK, M, blockK] where blockK - // is 128B. - // The rightmost dimension is contiguous in memory. + // The matrix A. The data type is controlled by options.mDtypeElt. + // The shape is [M, K]. The rightmost dimension is contiguous in memory. void const* mPtrA{nullptr}; // The block scaling factors to dequantize A. @@ -100,13 +91,8 @@ struct GemmGatedActData // The shape is [M] void const* mPtrPerTokenSfA{nullptr}; - // The matrix B. The data type is controlled by options.mDtypeB. - // - // When layoutB is MatrixLayout::MajorK, the shape is [N, K]. - // When layoutB is MatrixLayout::MajorMn, the shape is [K, N]. - // When layoutB is MatrixLayout::BlockMajorK, the shape is [K / blockK, N, blockK] where blockK - // is 128B. - // The rightmost dimension is contiguous in memory. + // The matrix B. The data type is controlled by options.mDtypeElt. + // The shape is [N, K]. The rightmost dimension is contiguous in memory. void const* mPtrB{nullptr}; // The scaling factors to dequantize B. @@ -146,21 +132,6 @@ struct GemmGatedActData // The shape is [N] void const* mPtrPerTokenSfB{nullptr}; - // The bias applied after the GEMM and before the activation function. - // The bias is applied before the global scaling factor. I.e. - // C = act(A * B + bias') * scaleC - // scaleC = dequantA * dequantB * quantC - // Thus, the bias' = bias / (dequantA * dequantB), where the bias is the original bias. - // - // if BiasType is N, the shape is [N] - // The bias is broadcasted along the M dimension. - // - // if BiasType is M, the shape is [M] - // The bias is broadcasted along the N dimension. - // - // The dtype is float32. - void const* mPtrBias{nullptr}; - // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. // TensorRT-LLM API requires a scaling factor on the device. // Shape is [1]. @@ -169,43 +140,6 @@ struct GemmGatedActData // TensorRT-LLM API requires a scaling factor on the device. // Shape is [1]. void const* mPtrScaleGate{nullptr}; - // The alpha for SwiGlu. - // Alpha is 1.f if nullptr. - // Shape is [1]. - void const* mPtrSwiGluAlpha{nullptr}; - // The beta for SwiGlu. - // Beta is 0.f if nullptr. - // Shape is [1]. - void const* mPtrSwiGluBeta{nullptr}; - // The clamp limit before the activation. - // Clamp limit is FLT_MAX if nullptr. - // When the input is FP8 or NVFP4, the clamp has to be scaled by limit' = limit / dequantAb. - // Shape is [1]. - // - // The given clamp limit applies to the dequantized values, so the order of operations would - // look something like this: - // - // x0 = x0 * dqAb - // x0 = clamp(x0, none, limit) - // x0 = x0 * sigmoid(alpha * x0) - // x1 = dqAb * x1 - // x1 = clamp(x1, -limit, limit) - // out = qC * (x1 + beta) * x0 - // - // Given that the dqAb and qC are combined into scaleC, we can bring the dqAb into the clamp - // limit and apply the clamping prior to dequantization: - // - // x0 = clamp(x0, none, limit / dqAb) - // x0 = x0 * dqAb - // x0 = x0 * sigmoid(alpha * x0) - // x1 = clamp(x1, -limit / dqAb, limit / dqAb) - // scaleC = dqAb * qC - // beta' = beta / dqAb - // out = scaleC * (x1 + beta') * x0 - // - // Note this assumes that scaleAb == scaleGate which is true in TRT-LLM MoE use-case - // - void const* mPtrClampLimit{nullptr}; }; struct OutputBuffers @@ -256,7 +190,7 @@ public: // Launch the cubin from the provided config. It calls all necessary memsets for internal buffers. // Provided config must be validated with isValidConfig before the call. int32_t run(GemmGatedActConfig const& config, void* workspace, GemmGatedActData const& data, void* cudaStream, - int32_t multiProcessorCount, bool usePdl = true, + int32_t multiProcessorCount, std::optional> moduleCache = std::nullopt) const; // Initializes the buffers before the world sync. Must be called before run. @@ -409,12 +343,8 @@ bool GemmGatedActInterface::isValidConfig(GemmGatedActConfig const& config, Gemm //////////////////////////////////////////////////////////////////////////////////////////////////// int32_t GemmGatedActInterface::run(GemmGatedActConfig const& config, void* workspace, GemmGatedActData const& data, - void* cudaStream, int32_t multiProcessorCount, bool usePdl, - std::optional> moduleCache) const + void* cudaStream, int32_t multiProcessorCount, std::optional> moduleCache) const { - // Might be used. - (void) usePdl; - (void) moduleCache; // Get options from config and data. auto options = getOptionsFromConfigAndData(config, data); @@ -443,12 +373,9 @@ int32_t GemmGatedActInterface::run(GemmGatedActConfig const& config, void* works // Create kernel params. auto kernelParams = gemmGatedAct::KernelParams::setKernelParams(options, data.mInputBuffers.mPtrA, data.mInputBuffers.mPtrSfA, data.mInputBuffers.mPtrPerTokenSfA, data.mInputBuffers.mPtrB, - data.mInputBuffers.mPtrSfB, data.mInputBuffers.mPtrPerTokenSfB, data.mInputBuffers.mPtrBias, - data.mOutputBuffers.mPtrC, reinterpret_cast(data.mInputBuffers.mPtrScaleC), - data.mOutputBuffers.mPtrSfC, reinterpret_cast(data.mInputBuffers.mPtrScaleGate), - reinterpret_cast(data.mInputBuffers.mPtrClampLimit), - reinterpret_cast(data.mInputBuffers.mPtrSwiGluAlpha), - reinterpret_cast(data.mInputBuffers.mPtrSwiGluBeta), reinterpret_cast(dRowMax), + data.mInputBuffers.mPtrSfB, data.mInputBuffers.mPtrPerTokenSfB, data.mOutputBuffers.mPtrC, + reinterpret_cast(data.mInputBuffers.mPtrScaleC), data.mOutputBuffers.mPtrSfC, + reinterpret_cast(data.mInputBuffers.mPtrScaleGate), reinterpret_cast(dRowMax), reinterpret_cast(dRowMaxBars)); // The size of the grid. @@ -468,26 +395,26 @@ int32_t GemmGatedActInterface::run(GemmGatedActConfig const& config, void* works #ifdef TLLM_GEN_EXPORT_INTERFACE CUmodule cuModule; CUfunction cuFunction; - if (moduleCache.has_value()) { ModuleCache& moduleCacheRef = moduleCache.value().get(); - // Modules are associated with a specific context, so the context is included in the key + // Modules are associated with a specific context so include the ctxId in the key CUcontext ctx; unsigned long long ctxId; cuCtxGetCurrent(&ctx); cuCtxGetId(ctx, &ctxId); - // Reinterpret the ctxId as a string to avoid needing a custom hash or converting it to a - // string in decimal representation. + // Reinterpret the ctxId as a string to avoid needing a custom hash or converting it to a string in decimal + // representation. std::string const ctxName = std::string(reinterpret_cast(&ctxId), sizeof(unsigned long long) / sizeof(char)); std::string const funcName = std::string(config.mFunctionName); + // As the ctxName is a fixed number of bytes, the two strings can just be appended without risk of a collision auto const moduleKey = ctxName + funcName; auto module = moduleCacheRef.find(moduleKey); - // Use cache if module is found, otherwise load and insert into cache + // Check if module exists in cache. Otherwise, load it if (module != moduleCacheRef.end()) { cuFunction = std::get<1>(module->second); @@ -517,9 +444,8 @@ int32_t GemmGatedActInterface::run(GemmGatedActConfig const& config, void* works // Run the kernel. auto result = trtllm::gen::launchKernel((void*) &kernelParams, cudaStream, config.mSharedMemSize, cuFunction, block3, grid3, cluster3, - usePdl - && (config.mOptions.mGridWaitForPrimaryEarlyExit | config.mOptions.mGridWaitForPrimaryA - | config.mOptions.mGridWaitForPrimaryB)); + config.mOptions.mGridWaitForPrimaryEarlyExit | config.mOptions.mGridWaitForPrimaryA + | config.mOptions.mGridWaitForPrimaryB); if (result != CUDA_SUCCESS) { return -1; @@ -548,5 +474,3 @@ int32_t GemmGatedActInterface::runInitBeforeWorldSync(GemmGatedActConfig const&, } // namespace gemmGatedAct //////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActOptions.h index a6cf385a13..b23efd2774 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActOptions.h @@ -48,9 +48,6 @@ namespace gemmGatedAct { -namespace gemmGatedAct -{ - //////////////////////////////////////////////////////////////////////////////////////////////////// namespace tg = trtllm::gen; @@ -58,16 +55,8 @@ namespace tg = trtllm::gen; // Type of the gated activation enum class ActType { - // For ActType == SwiGlu, ideally we would like to have something like - // gatedAct = scaleC * (x0 * scaleAb + beta) * ((x1 * scaleGate) * sigmoid(alpha * x1 * - // scaleGate)). - // But for now, we use the simplified version - // gatedAct = scaleC' * (x0 + beta') * ((x1 * scaleGate) * sigmoid(alpha * x1 * scaleGate)), - // where x0 and x1 are the raw numbers from Gemm, while scaleC and scaleGate are input scales, - // beta' = beta / scaleAb, scaleC' = scaleC * scaleAb. - // - // GatedSilu is a special case of SwiGlu where the alpha is 1.0 and the beta is 0.0. - SwiGlu + // silu(x) = x * sigmoid(x) = x * (1 / (1 + e^(-x))) + Silu = 0 }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -80,38 +69,24 @@ enum class ActType return (type == ActType::actType); \ } -TLLM_ACT_TYPE_FUNCTION(SwiGlu) +TLLM_ACT_TYPE_FUNCTION(Silu) #undef TLLM_ACT_TYPE_FUNCTION //////////////////////////////////////////////////////////////////////////////////////////////////// -inline std::string getActTypeName(ActType type) -{ - switch (type) - { - case ActType::SwiGlu: return "SwiGlu"; - default: return "Unknown type"; - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -struct GemmGatedActOptions : public gemm::GemmOptions +struct GemmGatedActOptions : virtual public gemm::GemmOptions { GemmGatedActOptions() = default; - GemmGatedActOptions(gemm::GemmOptions options, ActType actType, bool clampBeforeAct) + GemmGatedActOptions(gemm::GemmOptions const& options, ActType actType) : gemm::GemmOptions(options) , mActType(actType) - , mClampBeforeAct(clampBeforeAct) { } // Type of the gated activation. - ActType mActType{ActType::SwiGlu}; - // Clamp the dequantized values to the range [-limit, limit]. - bool mClampBeforeAct{false}; + ActType mActType{ActType::Silu}; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -133,7 +108,7 @@ inline bool checkAndUpdateGemmGatedActOptions( if (options.mUseTmaStore) { - TLLM_CHECK_ERROR(hiddenEpilogueTileSize * tg::dtypeGetNumBits(options.mDtypeC) / /* bits */ 8 % 32 == 0, + TLLM_CHECK_ERROR(hiddenEpilogueTileSize * tg::dtypeGetNumBits(options.mDtypeElt) / /* bits */ 8 % 32 == 0, "Unsupported output hidden tile size"); } @@ -163,11 +138,6 @@ inline bool checkAndUpdateGemmGatedActOptions( TLLM_CHECK_ERROR(doesSplitKUseDsmem(options.mSplitK), "Split-k GMEM and GemmGatedAct are not supported yet."); } - if (gemm::isBiasTypeMn(options.mBiasType)) - { - TLLM_CHECK_ERROR(options.mTransposeMmaOutput, "Bias type Mn is not supported with not transpose mma output."); - } - return true; } @@ -178,8 +148,7 @@ inline std::string dumpOptions(GemmGatedActOptions const& options) std::stringstream ss; ss << gemm::dumpOptions(options) << ", "; ss << "mActType=" - << "gemmGatedAct::ActType(" << static_cast(options.mActType) << ")," << std::endl; - ss << "mClampBeforeAct=" << options.mClampBeforeAct << "" << std::endl; + << "gemmGatedAct::ActType(" << static_cast(options.mActType) << ")" << std::endl; return ss.str(); } @@ -200,7 +169,6 @@ struct GemmGatedActConfig uint32_t const mSharedMemSize{0}; char const* mFunctionName{nullptr}; uint32_t const mNumThreadsPerCTA{0}; - char const* mHash{nullptr}; #else trtllm::gen::CudaRunner* mCudaRunner{nullptr}; #endif @@ -222,5 +190,3 @@ struct GemmGatedActConfig #undef TLLM_LOG_INFO #undef TLLM_LOG_ERROR #endif // TLLM_GEN_EXPORT_INTERFACE - -} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmOptions.h index 367d68b971..24624ee0aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmOptions.h @@ -23,7 +23,6 @@ #include "KernelParams.h" #include "KernelTraits.h" #include "trtllm/gen/DtypeDecl.h" -#include "trtllm/gen/MmaDecl.h" #include "trtllm/gen/SfLayoutDecl.h" #ifndef TLLM_GEN_EXPORT_INTERFACE #include "trtllm/gen/CudaRunner.h" @@ -34,14 +33,12 @@ template void printArgs(T first, Args... args) { -#ifdef TLLM_GEN_DEBUG std::cout << first; if constexpr (sizeof...(args) > 0) { std::cout << " "; printArgs(args...); } -#endif } #define TLLM_CHECK_ERROR(cond, ...) \ @@ -68,9 +65,6 @@ void printArgs(T first, Args... args) #endif -namespace gemmGatedAct -{ - namespace gemm { @@ -89,98 +83,9 @@ struct GemmOptions virtual ~GemmOptions() = default; #endif - GemmOptions() = default; - - GemmOptions(AllReduceAlgo allReduceAlgo, BiasType biasType, int blockK, int clusterDimX, int clusterDimY, - int clusterDimZ, tg::Dtype dtypeAcc, tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeMmaA, - tg::Dtype dtypeMmaB, bool enablesEarlyExit, bool enablesDelayedEarlyExit, bool enablesGlobalPtxKnobs, - int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, bool gridTriggerSecondaryA, - bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, - bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, KernelTraits kernelTraits, - MatrixLayout layoutA, MatrixLayout layoutB, int m, int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, - bool mockAllReduce, int n, int numSlicesForSplitK, int numSlicesForSliceK, int numStages, int numStagesMma, - int numStagesMmaWithinWorkTile, int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, - bool patchF2fp, bool useShuffledMatrixA, bool sliceK, SplitK splitK, bool transposeMmaOutput, int tileM, - int tileN, int tileK, bool useUnrollLoop2xForMma, bool useCustomMmaSchedule, - bool useHoistTryWaitForCustomMmaSchedule, bool useDeepSeekFp8, bool usePerTokenSfA, bool usePerTokenSfB, - bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, - tg::SfLayout sfLayoutC, int sfReshapeFactor, TileScheduler tileScheduler) - : mAllReduceAlgo{allReduceAlgo} - , mBiasType{biasType} - , mBlockK(blockK) - , mClusterDimX{clusterDimX} - , mClusterDimY{clusterDimY} - , mClusterDimZ{clusterDimZ} - , mDtypeAcc{dtypeAcc} - , mDtypeA{dtypeA} - , mDtypeB{dtypeB} - , mDtypeC{dtypeC} - , mDtypeMmaA{dtypeMmaA} - , mDtypeMmaB{dtypeMmaB} - , mEnablesEarlyExit{enablesEarlyExit} - , mEnablesDelayedEarlyExit{enablesDelayedEarlyExit} - , mEnablesGlobalPtxKnobs{enablesGlobalPtxKnobs} - , mEpilogueLdtmDps{epilogueLdtmDps} - , mEpilogueLdtmBits{epilogueLdtmBits} - , mEpilogueTileM{epilogueTileM} - , mEpilogueTileN{epilogueTileN} - , mGridTriggerSecondaryA{gridTriggerSecondaryA} - , mGridTriggerSecondaryB{gridTriggerSecondaryB} - , mGridWaitForPrimaryEarlyExit{gridWaitForPrimaryEarlyExit} - , mGridWaitForPrimaryA{gridWaitForPrimaryA} - , mGridWaitForPrimaryB{gridWaitForPrimaryB} - , mHoistLoadTaskInit{hoistLoadTaskInit} - , mHoistMmaTaskTryWaits{hoistMmaTaskTryWaits} - , mK{k} - , mKernelTraits{kernelTraits} - , mLayoutA{layoutA} - , mLayoutB{layoutB} - , mM{m} - , mMmaK{mmaK} - , mMmaKind{mmaKind} - , mMmaM{mmaM} - , mMmaN{mmaN} - , mMockAllReduce{mockAllReduce} - , mN{n} - , mNumSlicesForSplitK{numSlicesForSplitK} - , mNumSlicesForSliceK{numSlicesForSliceK} - , mNumStages{numStages} - , mNumStagesMma{numStagesMma} - , mNumStagesMmaWithinWorkTile{numStagesMmaWithinWorkTile} - , mNumStagesMmaAcrossWorkTile{numStagesMmaAcrossWorkTile} - , mNumStagesWorkId{numStagesWorkId} - , mOutputDebugTensors{outputDebugTensors} - , mPatchF2fp{patchF2fp} - , mUseShuffledMatrixA{useShuffledMatrixA} - , mSliceK{sliceK} - , mSplitK{splitK} - , mTransposeMmaOutput{transposeMmaOutput} - , mTileM{tileM} - , mTileN{tileN} - , mTileK{tileK} - , mUseUnrollLoop2xForMma{useUnrollLoop2xForMma} - , mUseCustomMmaSchedule{useCustomMmaSchedule} - , mUseHoistTryWaitForCustomMmaSchedule{useHoistTryWaitForCustomMmaSchedule} - , mUseDeepSeekFp8{useDeepSeekFp8} - , mUsePerTokenSfA{usePerTokenSfA} - , mUsePerTokenSfB{usePerTokenSfB} - , mUseTmaStore{useTmaStore} - , mUseTwoTmaLoadWarps{useTwoTmaLoadWarps} - , mUseTwoMmaWarps{useTwoMmaWarps} - , mSfLayoutA{sfLayoutA} - , mSfLayoutB{sfLayoutB} - , mSfLayoutC{sfLayoutC} - , mSfReshapeFactor{sfReshapeFactor} - , mTileScheduler{tileScheduler} - { - } - // The all-reduce algorithm. AllReduceAlgo mAllReduceAlgo{AllReduceAlgo::None}; - // The type of bias. - BiasType mBiasType{BiasType::None}; - // Block size in the K dimension - int mBlockK{-1}; + // Cluster size in X dim. int mClusterDimX{1}; // Cluster size in Y dim. @@ -189,34 +94,16 @@ struct GemmOptions int mClusterDimZ{1}; // Data type of the accumulators. tg::Dtype mDtypeAcc{tg::Dtype::Fp32}; - // Data type of the A matrix. - tg::Dtype mDtypeA{tg::Dtype::Fp16}; - // Data type of the B matrix. - tg::Dtype mDtypeB{tg::Dtype::Void}; + // Data type of the inputs. + tg::Dtype mDtypeElt{tg::Dtype::Fp16}; // Data type of the outputs. tg::Dtype mDtypeC{tg::Dtype::Void}; - // Data type of the A matrix for the MMA, if different from the input type. - tg::Dtype mDtypeMmaA{tg::Dtype::Void}; - // Data type of the B matrix for the MMA, if different from the input type. - tg::Dtype mDtypeMmaB{tg::Dtype::Void}; // Whether to enable early exit. bool mEnablesEarlyExit{false}; - // Whether to enable delayed early exit to overlap - // numNonExitingCtas loading with the other instructions. + // Whether to enable early exit. bool mEnablesDelayedEarlyExit{false}; // Whether to enable the global PTX knobs for guiding the compiler optimizations. bool mEnablesGlobalPtxKnobs{true}; - // The epilogue supports multiple LDTM shapes, although not every shape is applicable in every - // case. In particular: - // - On Hopper: must be 16dp256bit. - // - Transposed output: must be 16dp256bit. - // - Non-transposed output: - // - NvFp4 with fused activation: must be 32dp32bit. - // - Else it can be either 16dp256bit or 32dp32bit. - // The number of DP lanes in the epilogue LDTM. - int mEpilogueLdtmDps{16}; - // The number of bits in the epilogue LDTM. - int mEpilogueLdtmBits{256}; // Tile size for the epilogue in M dimension. int mEpilogueTileM{128}; // Tile size for the epilogue in N dimension. @@ -231,24 +118,16 @@ struct GemmOptions bool mGridWaitForPrimaryA{true}; // Whether the load of B should wait on a grid dependency. bool mGridWaitForPrimaryB{true}; - // Whether to hoist the initialization of the loading tasks. - bool mHoistLoadTaskInit{true}; // Whether to hoist the mbarrier try_waits (e.g., mma.prodAcq, smemAb.consWait) in the MMA task. bool mHoistMmaTaskTryWaits{false}; // The K dimension of GEMM. int mK{16 * 16}; // Traits of the kernel. KernelTraits mKernelTraits{}; - // Layout of A matrix - MatrixLayout mLayoutA{MatrixLayout::MajorK}; - // Layout of B matrix - MatrixLayout mLayoutB{MatrixLayout::MajorK}; // The M dimension of GEMM. int mM{128 * 2}; // Size of the MMA instruction in the K dimension. int mMmaK{16}; - // The kind of MMA instruction to use. - tg::MmaKind mMmaKind{tg::MmaKind::Auto}; // Size of the MMA instruction in the M dimension. int mMmaM{64}; // Size of the MMA instruction in the N dimension. @@ -277,8 +156,6 @@ struct GemmOptions int mNumStagesWorkId{3}; // Whether to output debug tensors. bool mOutputDebugTensors{false}; - // Patch float conversions. - bool mPatchF2fp{false}; // Reorder rows/cols in the A matrix for the better memory accesses in the M-major epilogue. bool mUseShuffledMatrixA{false}; // Slice-K implementation to use TileM dimension for TileK. @@ -319,12 +196,6 @@ struct GemmOptions tg::SfLayout mSfLayoutB{tg::SfLayout::R128c4}; // Scale factors layout for C. tg::SfLayout mSfLayoutC{tg::SfLayout::R128c4}; - // Number of "repeats", i.e. reshaping factor, to fold hidden dimension into SfBlock dimension. - // As result, the hidden dimension of the SF tensor must be a multiple of NumRepeats * - // numEltsPerSf * 4. This reduces the problem shape space that the kernel is able to run. - // But it reduces the number of L2 requests under the hood and potentially improves perf. - // Applies to layout 8x4 only. - int mSfReshapeFactor{1}; // Tile scheduler type. TileScheduler mTileScheduler{TileScheduler::Static}; }; @@ -354,7 +225,6 @@ struct GemmConfig uint32_t const mSharedMemSize{0}; char const* mFunctionName{nullptr}; uint32_t const mNumThreadsPerCTA{0}; - char const* mHash{nullptr}; #else trtllm::gen::CudaRunner* mCudaRunner{nullptr}; #endif @@ -382,50 +252,27 @@ inline std::string toString(trtllm::gen::Dtype e) //////////////////////////////////////////////////////////////////////////////////////////////////// -template <> -inline std::string toString(trtllm::gen::MmaKind e) -{ - return trtllm::gen::mmaKindToString(e); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - inline std::string dumpOptions(GemmOptions const& options) { std::stringstream ss; ss << "mAllReduceAlgo=" << "gemm::AllReduceAlgo(" << static_cast(options.mAllReduceAlgo) << ")" << "," << std::endl; - ss << "mBiasType=" - << "gemm::BiasType(" << static_cast(options.mBiasType) << ")" - << "," << std::endl; - ss << "mBlockK=" << options.mBlockK << "," << std::endl; ss << "mClusterDimX=" << options.mClusterDimX << "," << std::endl; ss << "mClusterDimY=" << options.mClusterDimY << "," << std::endl; ss << "mClusterDimZ=" << options.mClusterDimZ << "," << std::endl; ss << "mDtypeAcc=" << "trtllm::gen::Dtype(" << static_cast(options.mDtypeAcc) << ")" << "," << std::endl; - ss << "mDtypeA=" - << "trtllm::gen::Dtype(" << static_cast(options.mDtypeA) << ")" - << "," << std::endl; - ss << "mDtypeB=" - << "trtllm::gen::Dtype(" << static_cast(options.mDtypeB) << ")" + ss << "mDtypeElt=" + << "trtllm::gen::Dtype(" << static_cast(options.mDtypeElt) << ")" << "," << std::endl; ss << "mDtypeC=" << "trtllm::gen::Dtype(" << static_cast(options.mDtypeC) << ")" << "," << std::endl; - ss << "mDtypeMmaA=" - << "trtllm::gen::Dtype(" << static_cast(options.mDtypeMmaA) << ")" - << "," << std::endl; - ss << "mDtypeMmaB=" - << "trtllm::gen::Dtype(" << static_cast(options.mDtypeMmaB) << ")" - << "," << std::endl; ss << "mEnablesEarlyExit=" << options.mEnablesEarlyExit << "," << std::endl; ss << "mEnablesDelayedEarlyExit=" << options.mEnablesDelayedEarlyExit << "," << std::endl; ss << "mEnablesGlobalPtxKnobs=" << options.mEnablesGlobalPtxKnobs << "," << std::endl; - ss << "mEpilogueLdtmDps=" << options.mEpilogueLdtmDps << "," << std::endl; - ss << "mEpilogueLdtmBits=" << options.mEpilogueLdtmBits << "," << std::endl; ss << "mEpilogueTileM=" << options.mEpilogueTileM << "," << std::endl; ss << "mEpilogueTileN=" << options.mEpilogueTileN << "," << std::endl; ss << "mGridTriggerSecondaryA=" << options.mGridTriggerSecondaryA << "," << std::endl; @@ -433,20 +280,12 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mGridWaitForPrimaryEarlyExit=" << options.mGridWaitForPrimaryEarlyExit << "," << std::endl; ss << "mGridWaitForPrimaryA=" << options.mGridWaitForPrimaryA << "," << std::endl; ss << "mGridWaitForPrimaryB=" << options.mGridWaitForPrimaryB << "," << std::endl; - ss << "mHoistLoadTaskInit=" << options.mHoistLoadTaskInit << "," << std::endl; ss << "mHoistMmaTaskTryWaits=" << options.mHoistMmaTaskTryWaits << "," << std::endl; ss << "mK=" << options.mK << "," << std::endl; ss << "mKernelTraits={}" << "," << std::endl; - ss << "mLayoutA=gemm::MatrixLayout(" << static_cast(options.mLayoutA) << ")" - << "," << std::endl; - ss << "mLayoutB=gemm::MatrixLayout(" << static_cast(options.mLayoutB) << ")" - << "," << std::endl; ss << "mM=" << options.mM << "," << std::endl; ss << "mMmaK=" << options.mMmaK << "," << std::endl; - ss << "mMmaKind=" - << "trtllm::gen::MmaKind(" << static_cast(options.mMmaKind) << ")" - << "," << std::endl; ss << "mMmaM=" << options.mMmaM << "," << std::endl; ss << "mMmaN=" << options.mMmaN << "," << std::endl; ss << "mMockAllReduce=" << options.mMockAllReduce << "," << std::endl; @@ -459,7 +298,6 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mNumStagesMmaAcrossWorkTile=" << options.mNumStagesMmaAcrossWorkTile << "," << std::endl; ss << "mNumStagesWorkId=" << options.mNumStagesWorkId << "," << std::endl; ss << "mOutputDebugTensors=" << options.mOutputDebugTensors << "," << std::endl; - ss << "mPatchF2fp=" << options.mPatchF2fp << "," << std::endl; ss << "mUseShuffledMatrixA=" << options.mUseShuffledMatrixA << "," << std::endl; ss << "mSliceK=" << options.mSliceK << "," << std::endl; ss << "mSplitK=" @@ -487,7 +325,6 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mSfLayoutC=" << "trtllm::gen::SfLayout(" << static_cast(options.mSfLayoutC) << ")" << "," << std::endl; - ss << "mSfReshapeFactor=" << options.mSfReshapeFactor << "," << std::endl; ss << "mTileScheduler=" << "gemm::TileScheduler(" << static_cast(options.mTileScheduler) << ")" << std::endl; return ss.str(); @@ -503,14 +340,6 @@ inline T divUp(T a, T b) //////////////////////////////////////////////////////////////////////////////////////////////////// -template -inline T divUpMul(T a, T b) -{ - return gemm::divUp(a, b) * b; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - inline int32_t getShuffleBlockSize(int epilogueTileM) { int shuffleBlockSize = 16; @@ -527,136 +356,10 @@ inline int32_t getShuffleBlockSize(int epilogueTileM) inline bool checkAndUpdateGemmOptions( GemmOptions& options, bool isBlackwell, int /* tpGrpSize */, bool updateOptions = true) { - - if (options.mDtypeB == tg::Dtype::Void) + if (options.mDtypeElt == tg::Dtype::E4m3 && options.mMmaK != 32) { - if (updateOptions) - { - options.mDtypeB = options.mDtypeA; - } - else - { - return false; - } - } - - // If not specified, used the input dtypes as MMA dtypes (no cast required). - if (options.mDtypeMmaA == tg::Dtype::Void) - { - if (updateOptions) - { - options.mDtypeMmaA = options.mDtypeA; - } - else - { - return false; - } - } - if (options.mDtypeMmaB == tg::Dtype::Void) - { - if (updateOptions) - { - options.mDtypeMmaB = options.mDtypeB; - } - else - { - return false; - } - } - - // Check that the A cast is supported. - // Currently, we only support {MxFp4, NvFp4} -> Bf16. - TLLM_CHECK_ERROR((options.mDtypeA == options.mDtypeMmaA) - || ((options.mDtypeA == tg::Dtype::MxE2m1 || options.mDtypeA == tg::Dtype::E2m1) - && options.mDtypeMmaA == tg::Dtype::Bfloat16) - || (options.mDtypeA == tg::Dtype::E2m1 && options.mDtypeMmaA == tg::Dtype::E4m3), - "Unsupported cast for A: ", tg::dtypeToString(options.mDtypeA), " -> ", tg::dtypeToString(options.mDtypeMmaA)); - - // Check that the B cast is supported. - // Currently, we only support Fp8 -> MxFp8. - // TODO: add same support for A (no transpose) - TLLM_CHECK_ERROR((options.mDtypeB == options.mDtypeMmaB) - || (options.mDtypeB == tg::Dtype::E4m3 && options.mDtypeMmaB == tg::Dtype::MxE4m3), - "Unsupported cast for B: ", tg::dtypeToString(options.mDtypeB), " -> ", tg::dtypeToString(options.mDtypeMmaB)); - - if (options.mDtypeA != options.mDtypeMmaA) - { - TLLM_CHECK_ERROR(options.mTileM == 128, "TileM must be 128 when casting the input matrix A before the MMA."); - } - - if (options.mPatchF2fp) - { - TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::MxE2m1 && options.mDtypeMmaA == tg::Dtype::Bfloat16, - "PatchF2fp is only supported for MxFp4 to Bf16 casts."); - } - - // FIXME: We do not support different dtypes for A and B when not on Blackwell. - if (!isBlackwell) - { - TLLM_CHECK_ERROR( - options.mDtypeMmaA == options.mDtypeMmaB, "For non-Blackwell, A and B must have the same dtype."); - } - - // Check that the different dtypes for A and B are supported by the tensor core - // kind::f8f6f4 - if (options.mDtypeMmaA == tg::Dtype::E4m3 || options.mDtypeMmaA == tg::Dtype::E2m1) - { - TLLM_CHECK_ERROR(options.mDtypeMmaB == tg::Dtype::E4m3 || options.mDtypeMmaB == tg::Dtype::E2m1, - "For dtypeMmaA = E4m3/E2m1 A, dtypeMmaB must also be E4m3/E2m1."); - } - - // kind::mxf8f6f4 - if (options.mDtypeMmaA == tg::Dtype::MxE4m3 || options.mDtypeMmaA == tg::Dtype::MxE2m1) - { - TLLM_CHECK_ERROR(options.mDtypeMmaB == tg::Dtype::MxE4m3 || options.mDtypeMmaB == tg::Dtype::MxE2m1, - "For dtypeMmaA = MxE4m3 or MxE2m1, dtypeMmaB must also be MxE4m3 or MxE2m1."); - } - if (options.mDtypeMmaB == tg::Dtype::MxE4m3 || options.mDtypeMmaB == tg::Dtype::MxE2m1) - { - TLLM_CHECK_ERROR(options.mDtypeMmaA == tg::Dtype::MxE4m3 || options.mDtypeMmaA == tg::Dtype::MxE2m1, - "For dtypeMmaB = MxE4m3 or MxE2m1, dtypeMmaA must also be MxE4m3 or MxE2m1."); - } - - // kind::f16 - if (options.mDtypeMmaA == tg::Dtype::Fp16 || options.mDtypeMmaA == tg::Dtype::Bfloat16) - { - TLLM_CHECK_ERROR(options.mDtypeMmaB == options.mDtypeMmaA, - "For dtypeMmaA = Fp16/Bfloat16, dtypeMmaB must be the same as dtypeMmaA."); - } - - // When one of the inputs needs to be cast, we must use two load warps. - if ((options.mDtypeMmaA != options.mDtypeA || options.mDtypeMmaB != options.mDtypeB) - && !options.mUseTwoTmaLoadWarps) - { - TLLM_LOG_WARNING("Two TMA load warps must be enabled if any of the inputs needs to be cast."); - } - - // When different dtypes are used for A and B, we must use different tiles to do the loading. - // It is not strictly required, but current implementation of SmemAb requires that. - if (options.mDtypeA != options.mDtypeB) - { - TLLM_CHECK_ERROR( - options.mUseTwoTmaLoadWarps, "Two TMA load warps must be enabled for different input types of A and B."); - } - - // Get the mma kind for the input types. - if (options.mMmaKind == tg::MmaKind::Auto) - { - if (updateOptions) - { - options.mMmaKind = dtypeGetMmaKind(options.mDtypeMmaA, options.mDtypeMmaB); - } - else - { - return false; - } - } - - if ((options.mMmaKind == tg::MmaKind::Fp8Fp6Fp4 || options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) - && options.mMmaK != 32) - { - TLLM_LOG_WARNING("Unsupported MmaK (", options.mMmaK, ") for MmaKind=", gemm::toString(options.mMmaKind), - ". Setting MmaK to 32"); + TLLM_LOG_WARNING( + "Unsupported MmaK (", options.mMmaK, ") for ", gemm::toString(options.mDtypeElt), ". Setting MmaK to 32"); if (updateOptions) { options.mMmaK = 32; @@ -668,42 +371,15 @@ inline bool checkAndUpdateGemmOptions( } } - // Check LDTM shape. - if (isBlackwell) - { - TLLM_CHECK_ERROR((options.mEpilogueLdtmDps == 16 && options.mEpilogueLdtmBits == 256) - || (options.mEpilogueLdtmDps == 32 && options.mEpilogueLdtmBits == 32), - "Unsupported LDTM shape: ", options.mEpilogueLdtmDps, "dp", options.mEpilogueLdtmBits, "bit."); - if (options.mEpilogueTileM == 64) - { - TLLM_CHECK_ERROR(options.mEpilogueLdtmDps == 16, - "Unsupported LDTM shape for epilogueTileM=64: ", options.mEpilogueLdtmDps, "dp", - options.mEpilogueLdtmBits, "bit."); - } - if (options.mTransposeMmaOutput) - { - // We can't use 32dp32bit LDTM for transposed outputs because we need each thread to own - // multiple consecutive output elements. - TLLM_CHECK_ERROR((options.mEpilogueLdtmDps == 16 && options.mEpilogueLdtmBits == 256), - "Only 16dp256bit LDTM is supported for transposed outputs."); - } - } - else - { - TLLM_CHECK_ERROR(options.mEpilogueLdtmDps == 16 && options.mEpilogueLdtmBits == 256, - "Hopper does not use TMEM. The register layout corresponds to 16dp256bit. Got ", options.mEpilogueLdtmDps, - "dp", options.mEpilogueLdtmBits, "bit."); - } - // Constraints for NvFp4 and MxFp8. - if ((options.mMmaKind == tg::MmaKind::MxFp4NvFp4 || options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4 + if ((options.mDtypeElt == tg::Dtype::E2m1 || options.mDtypeElt == tg::Dtype::MxE4m3 || options.mDtypeC == tg::Dtype::MxE4m3) && options.mMmaM != 128) { // MMA M must be 128 when the input uses block scaling, or when the output is an Mx format. int newTileM = 128 * divUp(options.mTileM, 128); - TLLM_LOG_WARNING("Unsupported MmaM (", options.mMmaM, ") for MmaKind=", gemm::toString(options.mMmaKind), - ". Setting MmaM to 128 and TileM to ", newTileM); + TLLM_LOG_WARNING("Unsupported MmaM (", options.mMmaM, ") for dtypeElt=", gemm::toString(options.mDtypeElt), + ", dtypeC=", gemm::toString(options.mDtypeC), ". Setting MmaM to 128 and TileM to ", newTileM); if (updateOptions) { options.mMmaM = 128; @@ -714,15 +390,18 @@ inline bool checkAndUpdateGemmOptions( return false; } } - if (options.mMmaKind == tg::MmaKind::MxFp4NvFp4 || options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) + if (options.mDtypeElt == tg::Dtype::E2m1 || options.mDtypeElt == tg::Dtype::MxE4m3) { TLLM_CHECK_ERROR(isBlackwell, "Block scaling is only supported on Blackwell"); - int const mmaK = (options.mMmaKind == tg::MmaKind::MxFp4NvFp4) ? 64 : 32; + TLLM_CHECK_ERROR(options.mSfLayoutB == tg::SfLayout::R128c4 || options.mSfLayoutB == tg::SfLayout::R8c4, + "Only the 128x4 and 8x4 SF layouts are supported for B, got ", tg::sfLayoutToString(options.mSfLayoutB)); + + int const mmaK = (options.mDtypeElt == tg::Dtype::E2m1) ? 64 : 32; if (options.mMmaK != mmaK) { int newTileK = mmaK * divUp(options.mTileK, mmaK); - TLLM_LOG_WARNING("Unsupported MmaK (", options.mMmaK, ") for MmaKind=", gemm::toString(options.mMmaKind), + TLLM_LOG_WARNING("Unsupported MmaK (", options.mMmaK, ") for ", gemm::toString(options.mDtypeElt), ". Setting MmaK to ", mmaK, " and TileK to ", newTileK); if (updateOptions) { @@ -735,56 +414,18 @@ inline bool checkAndUpdateGemmOptions( } } - // The MMA N may only be smaller than 64 if it is equal to the tile N. - TLLM_CHECK_ERROR(options.mMmaN >= 64 || options.mMmaN == options.mTileN, "MmaN (", options.mMmaN, - ") must be >= 64 or equal to TileN (", options.mTileN, ")"); - } - if (tg::dtypeIsBlockFmt(options.mDtypeA)) - { - int numEltsPerSfA = tg::dtypeNumEltsPerSf(options.mDtypeA); - TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfA) == 0, "TileK (", options.mTileK, - ") must be a multiple of ", (4 * numEltsPerSfA), " for typeA ", gemm::toString(options.mDtypeA)); - auto const numEltsPerSfAInK = options.mK / numEltsPerSfA; - TLLM_CHECK_ERROR(numEltsPerSfAInK % 4 == 0, "K dimension of scaling factors for A (", numEltsPerSfAInK, - ") must be a multiple of 4"); - } - if (tg::dtypeIsBlockFmt(options.mDtypeB)) - { - TLLM_CHECK_ERROR(options.mSfLayoutB == tg::SfLayout::R128c4 || options.mSfLayoutB == tg::SfLayout::R8c4 - || options.mSfLayoutB == tg::SfLayout::Linear, - "Only the 128x4 and 8x4 SF layouts are supported for B, got ", tg::sfLayoutToString(options.mSfLayoutB)); - // TileN must be a multiple of the number of rows per SF tile. int const numSfTileRowsB = options.mSfLayoutB == tg::SfLayout::R128c4 ? 128 : 8; TLLM_CHECK_ERROR(options.mTileN % numSfTileRowsB == 0, "TileN (", options.mTileN, ") must be a multiple of ", numSfTileRowsB, " for B SF layout ", tg::sfLayoutToString(options.mSfLayoutB)); + // The MMA N may only be smaller than 64 if it is equal to the tile N. + TLLM_CHECK_ERROR(options.mMmaN >= 64 || options.mMmaN == options.mTileN, "MmaN (", options.mMmaN, + ") must be >= 64 or equal to TileN (", options.mTileN, ") for ", gemm::toString(options.mDtypeElt)); - int numEltsPerSfB = tg::dtypeNumEltsPerSf(options.mDtypeB); - TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfB) == 0, "TileK (", options.mTileK, - ") must be a multiple of ", (4 * numEltsPerSfB), " for typeB ", gemm::toString(options.mDtypeB)); - auto const numEltsPerSfBInK = options.mK / numEltsPerSfB; - TLLM_CHECK_ERROR(numEltsPerSfBInK % 4 == 0, "K dimension of scaling factors for B (", numEltsPerSfBInK, - ") must be a multiple of 4"); + int numEltsPerSf = tg::dtypeNumEltsPerSf(options.mDtypeElt); + TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSf) == 0, "TileK (", options.mTileK, + ") must be a multiple of ", (4 * numEltsPerSf), " for type ", gemm::toString(options.mDtypeElt)); } - - int32_t padMultiplierA = 1; - int32_t padMultiplierB = 1; - if (options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) - { - if (options.mDtypeA == tg::Dtype::MxE2m1) - { - padMultiplierA = 2; - } - if (options.mDtypeB == tg::Dtype::MxE2m1) - { - padMultiplierB = 2; - } - } - TLLM_CHECK_ERROR((padMultiplierA * tg::dtypeGetNumBits(options.mDtypeA) * options.mK / 8) % 16 == 0, - "K dimension of A must be aligned to 16 bytes."); - TLLM_CHECK_ERROR((padMultiplierB * tg::dtypeGetNumBits(options.mDtypeB) * options.mK / 8) % 16 == 0, - "K dimension of B must be aligned to 16 bytes."); - if (options.mDtypeC == tg::Dtype::E2m1 || options.mDtypeC == tg::Dtype::MxE4m3) { TLLM_CHECK_ERROR(isBlackwell, "Block scaling is only supported on Blackwell"); @@ -792,10 +433,8 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mSfLayoutC == tg::SfLayout::R128c4 || options.mSfLayoutC == tg::SfLayout::R8c4, "Only the 128x4 and 8x4 SF layouts are supported for C."); int const numSfTileRowsC = options.mSfLayoutC == tg::SfLayout::R128c4 ? 128 : 8; - int const tileTokenDim = options.mTransposeMmaOutput ? options.mTileN : options.mTileM; - TLLM_CHECK_ERROR_FMT(tileTokenDim % numSfTileRowsC == 0, - "Tile%s (%d) must be a multiple of %d for C SF layout %s", options.mTransposeMmaOutput ? "N" : "M", - tileTokenDim, numSfTileRowsC, tg::sfLayoutToString(options.mSfLayoutC).c_str()); + TLLM_CHECK_ERROR(options.mTileN % numSfTileRowsC == 0, "TileN (", options.mTileN, ") must be a multiple of ", + numSfTileRowsC, " for C SF layout ", tg::sfLayoutToString(options.mSfLayoutC)); int const hiddenDim = options.mTransposeMmaOutput ? options.mM : options.mN; int const hiddenGranularity = 4 * tg::dtypeNumEltsPerSf(options.mDtypeC); @@ -808,10 +447,10 @@ inline bool checkAndUpdateGemmOptions( // If dtypeC is unspecified (Dtype::Void), assign to the input dtype. if (options.mDtypeC == tg::Dtype::Void) { - TLLM_LOG_INFO("Setting dtypeC to ", tg::dtypeToString(options.mDtypeA)); + TLLM_LOG_INFO("Setting dtypeC to ", tg::dtypeToString(options.mDtypeElt)); if (updateOptions) { - options.mDtypeC = options.mDtypeA; + options.mDtypeC = options.mDtypeElt; } else { @@ -879,6 +518,10 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mM > 0 && options.mN > 0 && options.mK > 0, "M, N and K must be larger than 0"); TLLM_CHECK_ERROR(options.mNumSlicesForSplitK > 0, "Split K must be larger than 0."); + TLLM_CHECK_ERROR(options.mK % options.mNumSlicesForSplitK == 0, "K must be divisible by NumSlicesForSplitK."); + TLLM_CHECK_ERROR((options.mK / options.mNumSlicesForSplitK) % options.mTileK == 0, + "K / NumSlicesForSplitK must be divisible by TileK. Found TileK=", options.mTileK, " and K=", options.mK, + " and NumSlicesForSplitK=", options.mNumSlicesForSplitK); if (options.mUseShuffledMatrixA) { @@ -887,10 +530,8 @@ inline bool checkAndUpdateGemmOptions( shuffleBlockSize, ") when useShuffledMatrixA"); } - if (!options.mSliceK) - { - TLLM_CHECK_ERROR(options.mMmaM <= options.mEpilogueTileM, "EpilogueTileM must be larger or equal than mmaM."); - } + TLLM_CHECK_ERROR(options.mMmaM <= options.mEpilogueTileM && options.mMmaN <= options.mEpilogueTileN, + "EpilogueTileM and EpilogueTileN must be larger or equal than the respective atom sizes."); TLLM_CHECK_ERROR(options.mTileM % options.mEpilogueTileM == 0 && options.mTileN % options.mEpilogueTileN == 0, "TileM and TileN must be divisible by EpilogueTileM and EpilogueTileN respectively."); TLLM_CHECK_ERROR( @@ -1036,25 +677,19 @@ inline bool checkAndUpdateGemmOptions( { TLLM_CHECK_ERROR( options.mNumStagesMmaWithinWorkTile == 1, "Non-DeepSeekFp8 requires numStagesMmaWithinWorkTile == 1"); - if (options.mNumStagesMma > 1) - { - TLLM_CHECK_ERROR(options.mTileScheduler == TileScheduler::Persistent, - "Non-DeepSeekFp8 requires persistent scheduler when using numStagesMma >1"); - } } if (options.mUseDeepSeekFp8) { - TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::E4m3 && options.mDtypeB == tg::Dtype::E4m3, - "A and B dtype must be E4m3 for DeepSeek Fp8. Found dtypeA=", tg::dtypeToString(options.mDtypeA), - " dtypeB=", tg::dtypeToString(options.mDtypeB)); + TLLM_CHECK_ERROR(options.mDtypeElt == tg::Dtype::E4m3, "A and B dtype must be E4m3 for DeepSeek Fp8. Found ", + tg::dtypeToString(options.mDtypeElt)); TLLM_CHECK_ERROR(isBlackwell, "DeepSeek Fp8 is not supported for Hopper"); TLLM_CHECK_ERROR(options.mAllReduceAlgo == AllReduceAlgo::None, "DeepSeek Fp8 does not support AllReduce"); // Check that TileK = 128 for correct scaling of every 128 channels. TLLM_CHECK_ERROR(options.mTileK == 128, "Tile-K must be equal to 128 for DeepSeek Fp8"); - TLLM_CHECK_ERROR(options.mK % options.mTileK == 0, "K must be a multiple of TileK"); // Tile sizes of the output hidden dimension. + auto hiddenDim = options.mTransposeMmaOutput ? options.mM : options.mN; auto hiddenDimPerOutputTile = options.mTransposeMmaOutput ? options.mTileM : options.mTileN; auto hiddenDimPerEpilogueTile = options.mTransposeMmaOutput ? options.mEpilogueTileM : options.mEpilogueTileN; auto hiddenDimPerMma = options.mTransposeMmaOutput ? options.mMmaM : options.mMmaN; @@ -1067,6 +702,9 @@ inline bool checkAndUpdateGemmOptions( // Use two MMA warps to reduce mbar trywait latency. TODO: enable by default for deepseek. // options.mUseTwoMmaWarps = true; + // Make sure the GEMM-M/N dimension is a multiple of 128 when using DeepSeek FP8. + TLLM_CHECK_ERROR(hiddenDim % 128 == 0, "GEMM-", hiddenDimName, + " must be a multiple of 128 when using DeepSeek Fp8. Found ", hiddenDim); // Make sure the GEMM-K dimension is a multiple of 128 when using DeepSeek FP8. TLLM_CHECK_ERROR( options.mK % 128 == 0, "GEMM-K must be a multiple of 128 when using DeepSeek Fp8. Found ", options.mK); @@ -1094,32 +732,25 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mTileN == options.mEpilogueTileN, "TileN must be equal to EpilogueTileN for slice-K"); TLLM_LOG_WARNING("Overwriting TileM and EpilogueTileM to 32 for slice-K"); - if (options.mTileM != 32 || options.mEpilogueTileM != 32) + if (updateOptions) { - if (updateOptions) - { - // FIXME: it is possible to remove this restriction. - options.mTileM = 32; - options.mEpilogueTileM = 32; - } - else - { - return false; - } + // FIXME: it is possible to remove this restriction. + options.mTileM = 32; + options.mEpilogueTileM = 32; } - TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::E4m3 && options.mDtypeB == tg::Dtype::E4m3, - "Slice-K requires e4m3 input dtype"); - - if (options.mNumSlicesForSliceK != 4) + else { - if (updateOptions) - { - options.mNumSlicesForSliceK = 4; - } - else - { - return false; - } + return false; + } + TLLM_CHECK_ERROR(options.mDtypeElt == tg::Dtype::E4m3, "Slice-K requires e4m3 input dtype"); + + if (updateOptions) + { + options.mNumSlicesForSliceK = 4; + } + else + { + return false; } TLLM_CHECK_ERROR((options.mTileK / options.mMmaK) % options.mNumSlicesForSliceK == 0, "TileK (", options.mTileK, ") / MmaK (", options.mMmaK, ") must be a multiple of mNumSlicesForSliceK (", options.mNumSlicesForSliceK, @@ -1128,22 +759,14 @@ inline bool checkAndUpdateGemmOptions( if (options.mUseUnrollLoop2xForMma) { - // Number of iterations in K dimension after padding. - // Note the perCtaK in each CTA in the splitK group are padded to the same number of iterations. - // E.g., K = 512, TileK = 128, numSlicesForSplitK = 3. Then the padded K is - // - // ceil(512 / (128*3)) * (128*3) = 768 - // - int paddedK = divUpMul(options.mK, options.mTileK * options.mNumSlicesForSplitK); - // Check that the padded K (K rounded to next multiple of tileK) is a multiple of 2*TileK when - // UnrollLoop2x is enabled. This is to avoid deadlock when mma runs even-numbered loop while the - // other warps run odd-numbered loop. - // - bool notSupported = (paddedK / options.mNumSlicesForSplitK) % (options.mTileK * 2) != 0; + bool notSupported = (options.mK / options.mNumSlicesForSplitK) % (options.mTileK * 2) != 0; + // Check that the 2*TileK is a multiple of MmaK when UnrollLoop2x is enabled. + // This is to avoid deadlock when mma runs even-numbered loop while the other warps run + // odd-numbered loop. if (notSupported) { TLLM_LOG_WARNING("Size K / splitK must be a multiple of TileK * 2. Found TileK=", options.mTileK, - " and K=", options.mK, " (paddedK=", paddedK, ") and numSlicesForSplitK=", options.mNumSlicesForSplitK, + " and K=", options.mK, " and numSlicesForSplitK=", options.mNumSlicesForSplitK, ". Disabling unrollLoop2xForMma."); if (updateOptions) { @@ -1198,108 +821,23 @@ inline bool checkAndUpdateGemmOptions( // // Kernel 1: ----PREEXIT-----------FLUSH // Kernel 2: -------PREEXIT----ACQBULK---FLUSH - // Kernel 3: Warp 0: ---- (!) Output of 1,2 is not yet visible - // ----------------------- - // Warp 1: ---- (!) We normally assume that 1 is visible is not yet - // visible- Warp 2: -------------------ACQBULK-- Kernel 1,2 output visible - // ---------- + // Kernel 3: Warp 0: ---- (!) Output of 1,2 is not yet visible ----------------------- + // Warp 1: ---- (!) We normally assume that 1 is visible is not yet visible- + // Warp 2: -------------------ACQBULK-- Kernel 1,2 output visible ---------- TLLM_CHECK_ERROR((options.mGridWaitForPrimaryA || !options.mGridTriggerSecondaryA), "A: If a task triggers a secondary kernel, it must also wait for primary kernel."); TLLM_CHECK_ERROR((options.mGridWaitForPrimaryB || !options.mGridTriggerSecondaryB), "B: If a task triggers a secondary kernel, it must also wait for primary kernel."); - if (options.mUsePerTokenSfA || options.mUsePerTokenSfB) - { - // Checks applicable to both MetaFP8 and RoutingScalesOnInput - TLLM_CHECK_ERROR(!options.mUseDeepSeekFp8, "DeepSeek FP8 and per-token scaling are not compatible"); - TLLM_CHECK_ERROR(isBlackwell, "Per-token scaling is not supported for Hopper"); - if (options.mUsePerTokenSfA && options.mUsePerTokenSfB) - { - // MetaFP8 case - TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::E4m3 && options.mDtypeB == tg::Dtype::E4m3, - "A and B dtype must be E4m3 for Meta Fp8. Found dtypeA=", tg::dtypeToString(options.mDtypeA), - " dtypeB=", tg::dtypeToString(options.mDtypeB)); - } - else - { - // RoutingScalesOnInput case - TLLM_CHECK_ERROR((options.mUsePerTokenSfA && !options.mTransposeMmaOutput) - || (options.mUsePerTokenSfB && options.mTransposeMmaOutput), - "In RoutingScalesOnInput mode, perToken scales must be used on activations"); - } - } - - // The generation should support non K-major layouts for both A and B; however, it is unclear if - // there is a use-case - TLLM_CHECK_ERROR((options.mLayoutA == MatrixLayout::MajorK) || (options.mLayoutB == MatrixLayout::MajorK), - "At least one matrix must be in k-major layout"); - - // Some features are currently only support when both matrices are in K-major format - if (options.mLayoutB != MatrixLayout::MajorK || options.mLayoutB != MatrixLayout::MajorK) - { - TLLM_CHECK_ERROR(isBlackwell, "Non K-major layouts are only supported on Blackwell"); - TLLM_CHECK_ERROR(options.mSplitK == SplitK::None, "Non K-major layouts do not support split K"); - } - if (options.mLayoutA == MatrixLayout::MajorMn) - { - TLLM_CHECK_ERROR(tg::dtypeGetNumBits(options.mDtypeA) >= 8, "Subbyte types only support K major layout"); - } - if (options.mLayoutB == MatrixLayout::MajorMn) - { - TLLM_CHECK_ERROR(tg::dtypeGetNumBits(options.mDtypeB) >= 8, "Subbyte types only support K major layout"); - } - - if ((options.mLayoutA == MatrixLayout::BlockMajorK) || (options.mLayoutB == MatrixLayout::BlockMajorK)) - { - bool const isBlockA = options.mLayoutA == MatrixLayout::BlockMajorK; - - // Block K size must be 128B. - // TODO Leaving this as an option for now in case we want to expertiment with other block sizes - // As the user is not expected to set this, do not fail if updateOptions is false - int32_t const elemSizeInBits - = (isBlockA) ? tg::dtypeGetNumBits(options.mDtypeA) : tg::dtypeGetNumBits(options.mDtypeB); - int32_t const elemsIn128B = 128 * 8 /* Bits in byte */ / elemSizeInBits; - - if (options.mBlockK != elemsIn128B) - { - if (updateOptions) - { - options.mBlockK = elemsIn128B; - } - else - { - return false; - } - } - - if (options.mBlockK > options.mTileK) - { - TLLM_CHECK_ERROR(options.mBlockK % options.mTileK == 0, - "If block size is greater than tile size, block size must be a multiple of tile size"); - } - else if (options.mBlockK < options.mTileK) - { - TLLM_CHECK_ERROR(options.mTileK % options.mBlockK == 0, - "If tile size is greater than block size, tile size must be a multiple of block size"); - } - } - - if (!isBiasTypeNone(options.mBiasType)) - { - TLLM_CHECK_ERROR(!isBiasTypeMn(options.mBiasType), "BiasType::Mn is not supported"); - TLLM_CHECK_ERROR(!options.mUseDeepSeekFp8, "Bias is not supported for DeepSeek Fp8"); - TLLM_CHECK_ERROR(!(options.mUsePerTokenSfA && options.mUsePerTokenSfB), "Bias is not supported for Meta Fp8"); - } - if (updateOptions) { // Init kernel traits. - options.mKernelTraits = KernelTraits(options.mDtypeA, options.mDtypeB, options.mDtypeC, options.mDtypeAcc, - options.mDtypeMmaA, options.mDtypeMmaB, options.mMmaKind, options.mTileM, options.mTileN, options.mTileK, - options.mEpilogueTileM, options.mEpilogueTileN, options.mNumStages, options.mNumStagesMma, - options.mNumSlicesForSplitK, options.mNumSlicesForSliceK, options.mSplitK, options.mUseTmaStore, - options.mTransposeMmaOutput, options.mAllReduceAlgo, options.mTileScheduler == TileScheduler::Persistent, - options.mUseDeepSeekFp8, options.mUsePerTokenSfA, options.mUsePerTokenSfB, options.mBiasType); + options.mKernelTraits = KernelTraits(options.mDtypeElt, options.mDtypeC, options.mDtypeAcc, options.mTileM, + options.mTileN, options.mTileK, options.mEpilogueTileM, options.mEpilogueTileN, options.mNumStages, + options.mNumStagesMma, options.mNumSlicesForSplitK, options.mNumSlicesForSliceK, options.mSplitK, + options.mUseTmaStore, options.mTransposeMmaOutput, options.mAllReduceAlgo, + options.mTileScheduler == TileScheduler::Persistent, options.mUseDeepSeekFp8, options.mUsePerTokenSfA, + options.mUsePerTokenSfB); } return true; @@ -1319,5 +857,3 @@ inline bool checkAndUpdateGemmOptions( #undef TLLM_LOG_ERROR #endif // TLLM_GEN_EXPORT_INTERFACE - -} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelMetaInfo.h index 67ff778421..e4f9b89c93 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelMetaInfo.h @@ -19,88 +19,74 @@ #include "GemmGatedActOptions.h" -namespace gemmGatedAct -{ - namespace tensorrt_llm { namespace kernels { // clang-format off -#define TLLM_GEN_COMMIT "051000ea" -#define TLLM_GEN_EXPORT_VERSION "7.0.3.0" +#define TLLM_GEN_COMMIT "23d32a5" +#define TLLM_GEN_EXPORT_VERSION "0.0" static constexpr size_t tllmGenGemmGatedActListLen = 13; #ifndef EXCLUDE_SM_100 -extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_Fp16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin[]; #endif // EXCLUDE_SM_100 #ifndef EXCLUDE_SM_100 -extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_Fp16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len; #endif // EXCLUDE_SM_100 static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { #ifndef EXCLUDE_SM_100 -{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 86016, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "96e542e09e16b9927422afec4c16854cc29a142e7c1fee7dc9bbffd33f7eb87f", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 +{GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len, 86016, "gemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a", 448, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 4 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) +, /* mDtypeElt */ trtllm::gen::Dtype(17826819) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 +, /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 @@ -113,7 +99,6 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) @@ -121,7 +106,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -133,28 +118,19 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, -{GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin_len, 168960, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a", 224, "168780ebf86e7d79779a5b52b605e72d29de0fb8abc4bc1f38f040817b986607", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin_len, 168960, "gemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) +, /* mDtypeElt */ trtllm::gen::Dtype(1050630) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 128 , /* mGridTriggerSecondaryA */ 0 @@ -162,15 +138,11 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 , /* mMmaN */ 128 , /* mMockAllReduce */ 0 @@ -183,7 +155,6 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) @@ -203,44 +174,31 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, -{GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 112640, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "1a459a19a7d6e925c8f27bd9d4946ccbbf9758f4dedd799b658680d076bbfd21", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 112640, "gemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) +, /* mDtypeElt */ trtllm::gen::Dtype(1050630) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 +, /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 @@ -253,7 +211,6 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) @@ -273,44 +230,31 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, -{GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 110592, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 224, "e7914f7ecee3c15cdc90053c78218e6549e0107cbfd15c38426ed2ca72398e5e", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len, 110592, "gemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 4 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) +, /* mDtypeElt */ trtllm::gen::Dtype(1050630) , /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 +, /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 @@ -323,7 +267,6 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) @@ -343,44 +286,31 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, -{GemmGatedActKernel_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 86016, "gemmGatedActKernel_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "bc417f58a40b4ba68730e0dc1ff3a5343f883ffbf18c1056c9c1a7ebd4951595", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len, 86016, "gemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a", 448, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 4 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) +, /* mDtypeElt */ trtllm::gen::Dtype(17826819) +, /* mDtypeC */ trtllm::gen::Dtype(17826819) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 +, /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 @@ -393,7 +323,6 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) @@ -401,7 +330,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -413,28 +342,19 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, -{GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin_len, 152576, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a", 224, "ca3f4ea86ec01d7d6dbbebb3f5eb1248685403ea253e44931d0d01c8d24c81bf", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin_len, 218112, "gemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) +, /* mDtypeElt */ trtllm::gen::Dtype(1050630) +, /* mDtypeC */ trtllm::gen::Dtype(1050630) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 128 , /* mGridTriggerSecondaryA */ 0 @@ -442,28 +362,23 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 , /* mMmaN */ 128 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 +, /* mNumStages */ 3 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) @@ -483,44 +398,31 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, -{GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 111616, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "f5f90e279514f6d8322964082bb1b019acfc8d3a596b5ed382fc6bb9e8a23608", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 111616, "gemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) +, /* mDtypeElt */ trtllm::gen::Dtype(1050630) +, /* mDtypeC */ trtllm::gen::Dtype(1050630) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 +, /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 @@ -533,7 +435,6 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) @@ -553,44 +454,31 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, -{GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 110592, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 224, "41477079616a5969c5e3521238b1ec32d8f6a09680324b7a732d144ff3b18d07", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len, 110592, "gemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 4 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) +, /* mDtypeElt */ trtllm::gen::Dtype(1050630) +, /* mDtypeC */ trtllm::gen::Dtype(1050630) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 +, /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 @@ -603,7 +491,6 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) @@ -623,44 +510,31 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, -{GemmGatedActKernel_Fp16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 86016, "gemmGatedActKernel_Fp16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "754cf37e731c40683a27223bc2e0c3bd8dfb39b308f0ef16a9e9d857479f000e", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len, 86016, "gemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a", 448, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 4 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) +, /* mDtypeElt */ trtllm::gen::Dtype(17826819) +, /* mDtypeC */ trtllm::gen::Dtype(1052680) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 +, /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 @@ -673,7 +547,6 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) @@ -681,7 +554,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -693,28 +566,19 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, -{GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin_len, 168960, "gemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a", 224, "76ccade87557dfae73216d5531b6ef8ed098af960daca62ec7eabac3d1fb72df", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin, GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin_len, 168960, "gemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) +, /* mDtypeElt */ trtllm::gen::Dtype(1050630) +, /* mDtypeC */ trtllm::gen::Dtype(1052680) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 128 , /* mGridTriggerSecondaryA */ 0 @@ -722,15 +586,11 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 , /* mMmaN */ 128 , /* mMockAllReduce */ 0 @@ -743,7 +603,6 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) @@ -763,44 +622,31 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, -{GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 112640, "gemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "394e76b48c40fd0514773ca4078cf01274e2d0bff804bd176bab44d2a832021b", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 112640, "gemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) +, /* mDtypeElt */ trtllm::gen::Dtype(1050630) +, /* mDtypeC */ trtllm::gen::Dtype(1052680) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 +, /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 @@ -813,7 +659,6 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) @@ -833,44 +678,31 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, -{GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 110592, "gemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 224, "22842a61b00e9b98e5d2435e53779c43140031479c75f1a7a9fae5673d3a9d50", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len, 110592, "gemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 4 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052679) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) +, /* mDtypeElt */ trtllm::gen::Dtype(1050630) +, /* mDtypeC */ trtllm::gen::Dtype(1052680) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 +, /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 @@ -883,7 +715,6 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) @@ -903,44 +734,31 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, -{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 86016, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "feb2bb2aa2e88f8f3dfd8de985601474e5147b926b6d067884ed67730eda4b66", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 + }, gemm::SmVersion::Sm100a }, +{GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len, 86016, "gemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a", 448, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 4 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(17826818) -, /* mDtypeB */ trtllm::gen::Dtype(17826818) -, /* mDtypeC */ trtllm::gen::Dtype(1056776) -, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) -, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) +, /* mDtypeElt */ trtllm::gen::Dtype(17826819) +, /* mDtypeC */ trtllm::gen::Dtype(1056777) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 0 +, /* mGridTriggerSecondaryB */ 1 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryA */ 0 , /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 -, /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 @@ -953,7 +771,6 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) @@ -961,7 +778,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -973,14 +790,11 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) -, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) -, /* mClampBeforeAct */ 0 - }, gemm::SmVersion::Sm100a}, + }, gemm::SmVersion::Sm100a }, #endif // EXCLUDE_SM_100 }; // clang-format on } // namespace kernels } // namespace tensorrt_llm -} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelParams.h index dc6c9a928b..4a7bde2a17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelParams.h @@ -26,9 +26,6 @@ namespace gemmGatedAct { -namespace gemmGatedAct -{ - //////////////////////////////////////////////////////////////////////////////////////////////////// namespace tg = trtllm::gen; @@ -37,7 +34,7 @@ namespace tg = trtllm::gen; #ifdef TLLM_ENABLE_CUDA CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector const& shapes, - std::vector const& strides, std::vector const& tileShapes, void* gmemAddr) + std::vector const& strides, int32_t tileSizeMn, int32_t tileSizeK, void* gmemAddr) { CUtensorMap desc{}; // The data type. @@ -70,22 +67,22 @@ CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector const& s // The swizzle type. CUtensorMapSwizzle swizzleType{CU_TENSOR_MAP_SWIZZLE_NONE}; - int32_t fastestDimTileSizeBytes = (tileShapes[0] * tg::dtypeGetNumBits(dtype)) / /* bits */ 8; - if ((fastestDimTileSizeBytes % 128) == 0) + int32_t tileKSizeInBytes = (tileSizeK * tg::dtypeGetNumBits(dtype)) / /* bits */ 8; + if ((tileKSizeInBytes % 128) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_128B; } - else if ((fastestDimTileSizeBytes % 64) == 0) + else if ((tileKSizeInBytes % 64) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_64B; } - else if ((fastestDimTileSizeBytes % 32) == 0) + else if ((tileKSizeInBytes % 32) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_32B; } else { - std::cerr << "Unexpected fastestDimTileSizeBytes " << fastestDimTileSizeBytes << std::endl; + std::cerr << "Unexpected tileKSizeInBytes " << tileKSizeInBytes << std::endl; assert(false); } @@ -94,8 +91,8 @@ CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector const& s // Check shape must be in range [1, 2^32] int32_t dim = shapes.size(); - // Expect 2 dimensions for regular gemm or 3 dimensions for blocked layout - assert(dim == 2 || dim == 3); + // Expect 2 dimensions. + assert(dim == 2); // Check shape range. for (int32_t ii = 0; ii < dim; ++ii) { @@ -120,31 +117,19 @@ CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector const& s // The number of elements in 128B. auto const numEltsIn128B = numEltsPerUInt32 /*4B*/ * 32; // The number of tile K hidden size (per token) in each block of shared memory. - auto const numEltsInClampedFastestTileSize = std::min(numEltsIn128B, tileShapes[0]); + auto const numEltsInClampedTileKSize = std::min(numEltsIn128B, tileSizeK); - // Build box dim array. If tileShapes is smaller than dim, just fill with 1s. - assert(static_cast(tileShapes.size()) <= dim); - std::vector boxDim(dim, 1); - boxDim[0] = numEltsInClampedFastestTileSize; - for (size_t ii = 1; ii < tileShapes.size(); ++ii) - { - if (tileShapes[ii] > 256) - { - std::cerr << "buildNdTmaDescriptor: boxDim too large " << tileShapes[ii] << std::endl; - assert(false); - } - else - { - boxDim[ii] = tileShapes[ii]; - } - } + // Build tile shapes. + std::vector tileShapes(dim, 1); + tileShapes[0] = numEltsInClampedTileKSize; // tileSizeK + tileShapes[1] = tileSizeMn; // tileSizeMn // Set tile strides to 0; std::vector tileStrides(dim, 1); // Build the descriptor. CUresult result = cuTensorMapEncodeTiled(&desc, tmaDataFormat, - /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), boxDim.data(), tileStrides.data(), + /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), tileShapes.data(), tileStrides.data(), /*interleave=*/CU_TENSOR_MAP_INTERLEAVE_NONE, swizzleType, /*l2Promotion=*/CU_TENSOR_MAP_L2_PROMOTION_L2_128B, /*oobFill=*/CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); @@ -155,34 +140,10 @@ CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector const& s std::cerr << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; - - std::cerr << "Shape: "; - for (int ii = 0; ii < dim; ++ii) - { - std::cerr << shapes[ii] << " "; - } - std::cerr << std::endl; - - std::cerr << "Stride: "; - for (int ii = 0; ii < dim - 1; ++ii) - { - std::cerr << stridesInBytes[ii] << " "; - } - std::cerr << std::endl; - - std::cerr << "tileShapes: "; - for (int ii = 0; ii < dim; ++ii) - { - std::cerr << boxDim[ii] << " "; - } - std::cerr << std::endl; - - std::cerr << "tileStrides: "; - for (int ii = 0; ii < dim; ++ii) - { - std::cerr << tileStrides[ii] << " "; - } - std::cerr << std::endl; + std::cerr << "Shape: " << shapes[0] << " " << shapes[1] << std::endl; + std::cerr << "Stride: " << stridesInBytes[0] << std::endl; + std::cerr << "tileShapes: " << tileShapes[0] << " " << tileShapes[1] << std::endl; + std::cerr << "tileStrides: " << tileStrides[0] << " " << tileStrides[1] << std::endl; std::cerr << "swizzleType: " << int(swizzleType) << std::endl; assert(false); } @@ -204,54 +165,16 @@ struct KernelParams // TMA descriptor for A. // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideAb. - // - // If layoutA is MatrixLayout::MajorK - // Logical shape is [M, K]. - // Logical strides are [K, 1]. - // Tile box shape is [tileM, tileK]. - // Tile box strides are [tileK, 1]. - // Dtype is set from options.mDtypeA. - // - // If layoutA is MatrixLayout::MajorMn - // Logical shape is [K, M]. - // Logical strides are [M, 1]. - // Tile box shape is [tileK, tileM]. - // Tile box strides are [tileM, 1]. - // Dtype is set from options.mDtypeA. - // - // If layoutA is MatrixLayout::BlockMajorK - // Logical shape is [K / blockK, M, blockK]. - // Logical strides are [M * blockK, blockK, 1]. - // Tile box shape is [tileK / min(blockK, tileK), tileM, min(blockK, tileK)]. - // Tile box strides are [tileM * min(blockK, tileK), min(blockK, tileK), 1]. - // Dtype is set from options.mDtypeA, and blockK is 128B. + // makeTmaShapeStrideAb. Logical shape is [M, K]. Logical strides are [K, 1]. Tile box shape is + // [tileM, tileK]. Tile box strides are [tileK, 1]. + // Dtype is set from options.mDtypeElt. CUtensorMap tmaA; // TMA descriptor for B. // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideAb. - // - // If layoutB is MatrixLayout::MajorK - // Logical shape is [N, K]. - // Logical strides are [K, 1]. - // Tile box shape is [tileN, tileK]. - // Tile box strides are [tileK, 1]. - // Dtype is set from options.mDtypeB. - // - // If layoutB is MatrixLayout::MajorMn - // Logical shape is [K, N]. - // Logical strides are [N, 1]. - // Tile box shape is [tileK, tileN]. - // Tile box strides are [tileN, 1]. - // Dtype is set from options.mDtypeB. - // - // If layoutB is MatrixLayout::BlockMajorK - // Logical shape is [K / blockK, N, blockK]. - // Logical strides are [N * blockK, blockK, 1]. - // Tile box shape is [tileK / min(blockK, tileK), tileN, min(blockK, tileK)]. - // Tile box strides are [tileN * min(blockK, tileK), min(blockK, tileK), 1]. - // Dtype is set from options.mDtypeB, and blockK is 128B. + // makeTmaShapeStrideAb. Logical shape is [N, K]. Logical strides are [K, 1]. Tile box shape is + // [tileN, tileK]. Tile box strides are [tileK, 1]. + // Dtype is set from options.mDtypeElt. CUtensorMap tmaB; // TMA descriptor for C, (when useTmaStore is true) @@ -313,21 +236,21 @@ struct KernelParams // When transposeMmaOutput is true, the shape is [N, M / 2]. // Otherwise, the shape is [M, N / 2]. // Elements in a given row are stored contiguously in memory (row-major). - void* ptrC{nullptr}; + void* ptrC; // The scaling factors to dequantize A. // It is used when the DeepSeek FP8 recipe is enabled. Otherwise should be set to nullptr. // If transposeMmaOutput is false, shape is [K / 128, M]. // Otherwise, shape is [M / 128, K / 128]. // The rightmost dimension is contiguous in memory. - void const* ptrSfA{nullptr}; + void const* ptrSfA; // The scaling factors to dequantize B. // It is used when the DeepSeek FP8 recipe is enabled. Otherwise should be set to nullptr. // If transposeMmaOutput is false, shape is [N / 128, K / 128]. // Otherwise, shape is [K / 128, N]. // The rightmost dimension is contiguous in memory. - void const* ptrSfB{nullptr}; + void const* ptrSfB; // The per-token scaling factors from scale A. // @@ -337,7 +260,7 @@ struct KernelParams // transposed). The dtype is Dtype::Bfloat16 // // The shape is [M] - void const* ptrPerTokenSfA{nullptr}; + void const* ptrPerTokenSfA; // The per-token scaling factors from scale B. // @@ -347,22 +270,7 @@ struct KernelParams // transposed). The dtype is Dtype::Bfloat16 // // The shape is [N] - void const* ptrPerTokenSfB{nullptr}; - - // The bias applied after the GEMM and before the activation function. - // The bias is applied before applying the global scaling factor. I.e. - // C = act(A * B + bias') * scaleC - // scaleC = dequantA * dequantB * quantC - // Thus, the bias' = bias / (dequantA * dequantB), where the bias is the original bias. - // - // if BiasType is N, the shape is [N] - // The bias is broadcasted along the M dimension. - // - // if BiasType is M, the shape is [M] - // The bias is broadcasted along the N dimension. - // - // The dtype is float32. - void const* ptrBias{nullptr}; + void const* ptrPerTokenSfB; // The scaling factors calculated when quantizing C, for MxFp{4,8} and NvFp4 formats, also // used for the DeepSeek FP8 recipe. @@ -376,7 +284,7 @@ struct KernelParams // If transposeMmaOutput is false, shape is [M, N / 2 / 16]. // Otherwise, shape is [N, M / 2 / 16]. // The layout is controlled by options.mSfLayoutC (either R128c4 or R8c4). - void* ptrSfC{nullptr}; + void* ptrSfC; // Output is equal to // y = act(ptrScaleGate[0] * y1) * (ptrScaleC[0] * y2) @@ -384,29 +292,10 @@ struct KernelParams // The output tensor scaling factor for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization. // TensorRT-LLM API requires a scaling factor on the device. // Shape is [1]. - float const* ptrScaleC{nullptr}; + float const* ptrScaleC; // The output gate scale for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization. // Shape is [1]. - float const* ptrScaleGate{nullptr}; - - // The clamp limit before the activation. - // Shape is [1]. - // Clamp is INF if nullptr. - // If applied on SwiGlu, it will be: - // - // x_glu = x_glu.clamp(min=None, max=limit) - // x_linear = x_linear.clamp(min=-limit, max=limit) - float const* ptrClampLimit{nullptr}; - - // The alpha and beta for SwiGlu. - // Shape is [1]. One alpha and one beta per tensor in batch. - // Alpha is 1.f if nullptr. - // Beta is 0.f if nullptr. - // The formula: - // - // out_glu = x_glu * torch.sigmoid(alpha * x_glu) * (x_linear + beta) - float const* ptrSwiGluAlpha{nullptr}; - float const* ptrSwiGluBeta{nullptr}; + float const* ptrScaleGate; // The M dimension. // It is the total number of tokens if A is the activation matrix. @@ -439,12 +328,12 @@ struct KernelParams // Pointer for partial row max for DeepSeek FP8 recipe. // This is temporary storage for the row max results. // The shape is [2, M, N / 128] and the dtype is float. - float* ptrPartialRowMax{nullptr}; + float* ptrPartialRowMax; // Flags in global memory that sync on "exit" for row max computation. // The size is numTilesM * numTilesN / 2 and the dtype is uint32_t. // The memory must be set to 0 before the kernel launch. - uint32_t* ptrRowMaxCompletionBars{nullptr}; + uint32_t* ptrRowMaxCompletionBars; enum class MatrixType { @@ -457,24 +346,13 @@ struct KernelParams template static auto makeTmaShapeStrideAbc(GemmOptions const& options, MatrixType matrixType) { - // The outer dimension. auto numTokens = (matrixType == MatrixType::MatrixA || matrixType == MatrixType::MatrixC) ? options.mM : options.mN; - // The outer dimension tile size. - auto tileNumTokens = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileM - : (matrixType == MatrixType::MatrixA) ? options.mTileM - : options.mTileN; - // The inner dimension. auto hiddenSize = (matrixType == MatrixType::MatrixC) ? options.mN / 2 : options.mK; - // The inner dimension tile size. - auto tileHiddenSize = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileN / 2 : options.mTileK; - // Swap variables if transpose output if (matrixType == MatrixType::MatrixC && options.mTransposeMmaOutput) { numTokens = options.mN; hiddenSize = options.mM / 2; - tileNumTokens = options.mEpilogueTileN; - tileHiddenSize = options.mEpilogueTileM / 2; } // The cute tensor shape for A/B: (numTokens, hiddenSize). // Note that TMA descriptor expects the first dimension's stride to be @@ -485,41 +363,12 @@ struct KernelParams // Swap the first two dimension as mentioned before. auto stride = std::vector{1, static_cast(hiddenSize)}; - // Assemble the box shape - std::vector tileShape = {tileHiddenSize, tileNumTokens}; - - // Alternate layouts do not apply to matrixC - if (matrixType != MatrixType::MatrixC) - { - gemm::MatrixLayout layout = (matrixType == MatrixType::MatrixA) ? options.mLayoutA : options.mLayoutB; - if (layout == gemm::MatrixLayout::MajorMn) - { - // Apply transpose if necessary - std::swap(shape[0], shape[1]); - stride[1] = numTokens; - std::swap(tileShape[0], tileShape[1]); - } - else if (layout == gemm::MatrixLayout::BlockMajorK) - { - // Set shapes based on blocking layout - shape = {static_cast(options.mBlockK), static_cast(numTokens), - static_cast(options.mK / options.mBlockK)}; - stride - = {1, static_cast(options.mBlockK), static_cast(numTokens * options.mBlockK)}; - - // If blockK > tileK, then the inner most box size will be based on the tile - int32_t const tileBlockK = std::min(options.mBlockK, tileHiddenSize); - tileShape = {tileBlockK, tileNumTokens, tileHiddenSize / tileBlockK}; - } - } - - return std::make_tuple(shape, stride, tileShape); + return std::make_tuple(shape, stride); } // Create the TMA shape/stride for A/B block scaling factors. template - static auto makeTmaShapeStrideSfAb( - GemmOptions const& options, MatrixType matrixType, tg::SfLayout layout, int sfReshapeFactor) + static auto makeTmaShapeStrideSfAb(GemmOptions const& options, MatrixType matrixType, tg::SfLayout layout) { // The outer dimension. auto numTokens = matrixType == MatrixType::MatrixA ? options.mM : options.mN; @@ -529,10 +378,8 @@ struct KernelParams auto numTokensPerTile = matrixType == MatrixType::MatrixA ? options.mTileM : options.mTileN; // The inner tile dimension. auto hiddenSizePerTile = options.mTileK; - // The dtype of the matrix. - tg::Dtype matrixDtype = matrixType == MatrixType::MatrixA ? options.mDtypeA : options.mDtypeB; // Number of elements per scaling factor. - int32_t const numEltsPerSf = (matrixDtype == tg::Dtype::E2m1) ? 16 : 32; + int32_t const numEltsPerSf = (options.mDtypeElt == tg::Dtype::E2m1) ? 16 : 32; switch (layout) { @@ -570,36 +417,15 @@ struct KernelParams { // The scaling factor tensor packs 8x4 tiles into contiguous 32B blocks. // - // As the inner dimension (k) is often a multiple of the tile size, we can reshape to use - // fewer read requests, if the tile dimensions allow. It does not reduce the number of - // instructions. - // + // As the inner dimension (k) is required to be a multiple of the tile size, we + // can reshape to use fewer read requests, if the tile dimensions allow. // I.e., let's define r = min(⌈hiddenSizePerTile / (numEltsPerSf * 4)⌉, 8) // - // The "logical" tensor is: [outer, inner / numEltsPerSf] - // The 8x4 SF layout is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf), 32] - // The TMA tensor shape is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf * r), r * 32] - // - // The caveat of NumRepeats>1 is we must pad the hidden dimension of SF to multiples of - // NumRepeats * numEltsPerSf * 4. + // The "logical" tensor is: [outer, inner / numEltsPerSf] + // The 8x4 SF layout is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 32] + // The TMA tensor shape is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf * r), r * 32] - // Detect if the supplied factor is power of 2. E.g., 0b0100 and (0b0100 - 1) == 0b0000. - int const r = sfReshapeFactor; - if (r > 0 && (r & (r - 1)) != 0) - { - throw std::runtime_error( - "mSfReshapeFactor must be positive and a power of 2. Found " + std::to_string(r)); - } - - // Sanitize number of repeats so it doesn't exceed the dimension. - int const repeats = std::min(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4), r); - - // Detect if the input hidden size K is a multiple of the repeats. - if (tg::ceilDiv(hiddenSize, numEltsPerSf * 4) % repeats != 0) - { - throw std::runtime_error("SF hiddenSize K (" + std::to_string(tg::ceilDiv(hiddenSize, numEltsPerSf * 4)) - + ") must be a multiple of repeats (" + std::to_string(repeats) + ")"); - } + int const repeats = std::min(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4), 8); auto shape = std::vector{static_cast(repeats * 32), static_cast(tg::ceilDiv(hiddenSize, numEltsPerSf * 4 * repeats)), @@ -619,7 +445,7 @@ struct KernelParams return std::make_tuple(shape, stride, tileShapes); } - default: throw std::runtime_error("Unsupported SF layout"); + default: assert(false); } return std::make_tuple(std::vector{}, std::vector{}, std::vector{}); } @@ -627,43 +453,38 @@ struct KernelParams // Setup the kernel parameters. template static KernelParams setKernelParams(GemmOptions_ const& options, void const* ptrA, void const* ptrSfA, - void const* ptrPerTokenSfA, void const* ptrB, void const* ptrSfB, void const* ptrPerTokenSfB, - void const* ptrBias, void* ptrC, float const* ptrScaleC, void* ptrSfC, float const* ptrScaleGate, - float const* ptrClampLimit, float const* ptrSwiGluAlpha, float const* ptrSwiGluBeta, float* rowMax, - uint32_t* rowMaxBars) + void const* ptrPerTokenSfA, void const* ptrB, void const* ptrSfB, void const* ptrPerTokenSfB, void* ptrC, + float const* ptrScaleC, void* ptrSfC, float const* ptrScaleGate, float* rowMax, uint32_t* rowMaxBars) { // Create the return struct. KernelParams params; // Shape/stride for gmem tensor A. - auto [shapeA, strideA, tileShapeA] = makeTmaShapeStrideAbc(options, MatrixType::MatrixA); + auto [shapeA, strideA] = makeTmaShapeStrideAbc(options, MatrixType::MatrixA); // Build tma descriptor for A. - params.tmaA - = gemmGatedAct::buildNdTmaDescriptor(options.mDtypeA, shapeA, strideA, tileShapeA, const_cast(ptrA)); + params.tmaA = gemmGatedAct::buildNdTmaDescriptor( + options.mDtypeElt, shapeA, strideA, options.mTileM, options.mTileK, const_cast(ptrA)); // Shape/stride for gmem tensor B. - auto [shapeB, strideB, tileShapeB] = makeTmaShapeStrideAbc(options, MatrixType::MatrixB); + auto [shapeB, strideB] = makeTmaShapeStrideAbc(options, MatrixType::MatrixB); // Build tma descriptor for B. - params.tmaB - = gemmGatedAct::buildNdTmaDescriptor(options.mDtypeB, shapeB, strideB, tileShapeB, const_cast(ptrB)); + params.tmaB = gemmGatedAct::buildNdTmaDescriptor( + options.mDtypeElt, shapeB, strideB, options.mTileN, options.mTileK, const_cast(ptrB)); - if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE4m3) + if (options.mDtypeElt == tg::Dtype::E2m1 || options.mDtypeElt == tg::Dtype::MxE4m3) { - tg::Dtype const dTypeSf = tg::dtypeGetBlockSfType(options.mDtypeA); + tg::Dtype const dTypeSf = tg::dtypeGetBlockSfType(options.mDtypeElt); // Build TMA descriptor for gmem A block scaling factors. auto [shapeSfA, strideSfA, tileShapesSfA] - = makeTmaShapeStrideSfAb(options, MatrixType::MatrixA, tg::SfLayout::R128c4, options.mSfReshapeFactor); + = makeTmaShapeStrideSfAb(options, MatrixType::MatrixA, tg::SfLayout::R128c4); params.tmaSfA = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast(ptrSfA)); - } - if (options.mDtypeB == tg::Dtype::E2m1 || options.mDtypeB == tg::Dtype::MxE4m3) - { - tg::Dtype const dTypeSf = tg::dtypeGetBlockSfType(options.mDtypeB); + // Build TMA descriptor for gmem B block scaling factors. auto [shapeSfB, strideSfB, tileShapesSfB] - = makeTmaShapeStrideSfAb(options, MatrixType::MatrixB, options.mSfLayoutB, options.mSfReshapeFactor); + = makeTmaShapeStrideSfAb(options, MatrixType::MatrixB, options.mSfLayoutB); params.tmaSfB = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfB, strideSfB, tileShapesSfB, const_cast(ptrSfB)); } @@ -671,10 +492,14 @@ struct KernelParams if (options.mUseTmaStore) { // Shape/stride for gmem tensor C. - auto [shapeC, strideC, tileShapeC] = makeTmaShapeStrideAbc(options, MatrixType::MatrixC); + auto [shapeC, strideC] = makeTmaShapeStrideAbc(options, MatrixType::MatrixC); + + // Swap M and N tiles for the M-major epilogue. + auto outputTileM = options.mTransposeMmaOutput ? options.mEpilogueTileN : options.mEpilogueTileM; + auto outputTileN = options.mTransposeMmaOutput ? options.mEpilogueTileM : options.mEpilogueTileN; // Build tma descriptor for C. params.tmaC = gemmGatedAct::buildNdTmaDescriptor( - options.mDtypeC, shapeC, strideC, tileShapeC, const_cast(ptrC)); + options.mDtypeC, shapeC, strideC, outputTileM, outputTileN / 2, const_cast(ptrC)); } params.ptrC = ptrC; @@ -687,13 +512,8 @@ struct KernelParams params.ptrPerTokenSfA = ptrPerTokenSfA; params.ptrPerTokenSfB = ptrPerTokenSfB; - params.ptrBias = ptrBias; - params.ptrScaleC = ptrScaleC; params.ptrScaleGate = ptrScaleGate; - params.ptrClampLimit = ptrClampLimit; - params.ptrSwiGluAlpha = ptrSwiGluAlpha; - params.ptrSwiGluBeta = ptrSwiGluBeta; params.rank = 0; params.tpGrpSize = 1; @@ -712,5 +532,3 @@ struct KernelParams //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace gemmGatedAct - -} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelTraits.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelTraits.h index 34189eebb0..1c3d4581c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelTraits.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelTraits.h @@ -20,10 +20,6 @@ #include "trtllm/gen/CommonUtils.h" #include "trtllm/gen/DtypeDecl.h" #include -#include - -namespace gemmGatedAct -{ namespace gemm { @@ -78,38 +74,6 @@ public: } // Returns the offset of the ith chunk - int32_t getChunkOffsetByName(std::string const& name) const - { - for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii) - { - if (mSmemChunkNames[ii] == name) - { - return getChunkOffset(ii); - } - } - throw std::runtime_error("Name not found: " + name); - } - - // Returns the first chunk reuse flag given chunk name. - int getFirstChunkReuseFlagByName(std::string const& name) const - { - for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii) - { - if (mSmemChunkNames[ii] == name) - { - return getFirstChunkReuseFlag(ii); - } - } - throw std::runtime_error("Name not found: " + name); - } - - // Function to calculate the total size of the SMEM array - int32_t getTotalSize() const - { - return getOffsetBeforeChunk(static_cast(mNumBytesAndAlignmentPerSmemChunk.size())); - } - -private: int32_t getChunkOffset(int32_t ii) const { if (mFirstChunkReuse[ii]) @@ -124,6 +88,12 @@ private: return getSizePaddedToAlignment(offset, mNumBytesAndAlignmentPerSmemChunk[ii].second); } + // Function to calculate the total size of the SMEM array + int32_t getTotalSize() const + { + return getOffsetBeforeChunk(static_cast(mNumBytesAndAlignmentPerSmemChunk.size())); + } + // Returns the first chunk reuse flag for the ith chunk. int getFirstChunkReuseFlag(int32_t ii) const { @@ -162,24 +132,6 @@ private: //////////////////////////////////////////////////////////////////////////////////////////////////// -int getNumSmemBitsPerElt(tg::Dtype dtype, tg::MmaKind mmaKind) -{ - if (mmaKind == tg::MmaKind::Auto) - { - throw std::runtime_error("mmaKind != tg::MmaKind::Auto"); - } - if (mmaKind == tg::MmaKind::MxFp8Fp6Fp4) - { - return 8; - } - else - { - return tg::dtypeGetNumBits(dtype); - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - class KernelTraits { public: @@ -187,13 +139,11 @@ public: KernelTraits() {} // The constructor. - KernelTraits(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeAcc, tg::Dtype dtypeMmaA, - tg::Dtype dtypeMmaB, tg::MmaKind mmaKind, int32_t tileM, int32_t tileN, int32_t tileK, int32_t epilogueTileM, - int32_t epilogueTileN, int32_t numStages, int32_t numStagesMma, int32_t numSlicesForSplitK, - int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore, bool transposeMmaOutput, - AllReduceAlgo allReduceAlgo, bool usePersistentScheduler, bool useDeepSeekFp8, bool usePerTokenSfA, - bool usePerTokenSfB, BiasType biasType) - : mMmaKind{mmaKind} + KernelTraits(tg::Dtype dtypeElt, tg::Dtype dtypeC, tg::Dtype dtypeAcc, int32_t tileM, int32_t tileN, int32_t tileK, + int32_t epilogueTileM, int32_t epilogueTileN, int32_t numStages, int32_t numStagesMma, + int32_t numSlicesForSplitK, int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore, + bool transposeMmaOutput, AllReduceAlgo allReduceAlgo, bool usePersistentScheduler, bool useDeepSeekFp8, + bool usePerTokenSfA, bool usePerTokenSfB) { // // SMEM @@ -207,19 +157,13 @@ public: // [rowMax ] (16B aligned) (if needed) // [sliceK ] (16B aligned) (if needed) // [per-token SF ] (16B aligned) (if needed) - // [bias ] (16B aligned) (if needed) // // SMEM for smemA and smemB might be repurposed and used for gmemC0 and gmemC1: // // [..smemA..][..smemB..][..smemBShuffle..] - // [..gmemC0..][..gmemC1..][..rowMax..][..sliceK..][..per-token SF..][..bias..] + // [..gmemC0..][..gmemC1..][..rowMax..][..sliceK..] // - if (mMmaKind == tg::MmaKind::Auto) - { - mMmaKind = dtypeGetMmaKind(dtypeMmaA, dtypeMmaB); - } - std::vector> numBytesAndAlignmentPerSmemChunk; std::vector firstChunkReuseSmem; // Buffer names for inspection purposes. @@ -228,8 +172,7 @@ public: // LoadA { // Number of bytes in load A shared memory. - auto const numSmemBytesLoadA - = numStages * tileM * tileK * getNumSmemBitsPerElt(dtypeA, mMmaKind) / 8 /* bits */; + auto const numSmemBytesLoadA = numStages * tileM * tileK * tg::dtypeGetNumBits(dtypeElt) / 8 /* bits */; // Number of bytes for load A alignment for TMA load. auto const numBytesAlignmentLoadA = 1024; // loadA is already at first chunk. No need to reuse it. @@ -244,8 +187,7 @@ public: // LoadB { // Number of bytes in load B shared memory. - auto const numSmemBytesLoadB - = numStages * tileN * tileK * getNumSmemBitsPerElt(dtypeB, mMmaKind) / 8 /* bits */; + auto const numSmemBytesLoadB = numStages * tileN * tileK * tg::dtypeGetNumBits(dtypeElt) / 8 /* bits */; // Number of bytes for load B alignment for TMA load. auto const numBytesAlignmentLoadB = 1024; // No need to reuse the first chunk. @@ -265,7 +207,7 @@ public: { // Number of bytes in save shuffled B in shared memory. auto const numSmemBytesLoadB = numSlicesForSliceK > 1 - ? numStages * tileN * tileK * getNumSmemBitsPerElt(dtypeB, mMmaKind) / 8 /* bits */ + ? numStages * tileN * tileK * tg::dtypeGetNumBits(dtypeElt) / 8 /* bits */ : 0; // Number of bytes for load B alignment for TMA load. auto const numBytesAlignmentLoadB = 1024; @@ -371,29 +313,6 @@ public: firstChunkReuseSmem.emplace_back(false); } - // Bias - { - int32_t numBytesSmemBias = 0; - if (isBiasTypeN(biasType)) - { - numBytesSmemBias = tileN * sizeof(float); - } - else if (isBiasTypeM(biasType)) - { - numBytesSmemBias = tileM * sizeof(float); - } - else if (isBiasTypeMn(biasType)) - { - numBytesSmemBias = tileM * tileN * sizeof(float); - } - // Number of bytes alignment for bias - auto const numBytesAlignmentBias = 16; - // Add info. - smemChunkNames.emplace_back("smemBias"); - numBytesAndAlignmentPerSmemChunk.emplace_back(std::make_pair(numBytesSmemBias, numBytesAlignmentBias)); - firstChunkReuseSmem.emplace_back(false); - } - // Per-block absolute maximum for multi-warp reduction. { // Number of bytes: number of epilogue warps * number of tile columns. @@ -408,25 +327,6 @@ public: firstChunkReuseSmem.emplace_back(false); } - // SmemConstSfBuf - // A buffer used to copy constant values to TMEM. - { - // Do we need the buffer? - bool const useConstSfBuf = dtypeB == tg::Dtype::E4m3 && dtypeMmaB == tg::Dtype::MxE4m3; - // Number of bytes for the buffer. - auto const numSmemBytesConstSfBuf = useConstSfBuf ? 512 : 0; - // Number of bytes for the alignment of the buffer. - auto const numBytesAlignmentConstSfBuf = 16; - // No need to reuse the first chunk. - auto const reuseChunksSmemConstSfBuf = false; - - // Add info. - smemChunkNames.emplace_back("smemConstSfBuf"); - numBytesAndAlignmentPerSmemChunk.emplace_back( - std::make_pair(numSmemBytesConstSfBuf, numBytesAlignmentConstSfBuf)); - firstChunkReuseSmem.emplace_back(reuseChunksSmemConstSfBuf); - } - // Create SMEM helper object. mSmemAllocatorHelper = MemAllocatorHelper(numBytesAndAlignmentPerSmemChunk, firstChunkReuseSmem, smemChunkNames); @@ -470,12 +370,10 @@ public: // Matrix A { - // We use TMEM for A if we use slice-K or if we need to cast A. - bool const useTmemA = (numSlicesForSliceK > 1) || (dtypeMmaA != dtypeA); // Number of columns for A. - auto const numTmemColsA = useTmemA ? numStages * tileK - / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeMmaA)) - : 0; + auto const numTmemColsA = numSlicesForSliceK > 1 ? numStages * tileK + / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeElt)) + : 0; // Number of columns for A alignment. auto const numColsAlignmentA = 4; // No need to reuse TMEM. @@ -487,18 +385,15 @@ public: firstChunkReuseTmem.emplace_back(reuseChunksTmemA); } + bool const useBlockScaling = tg::dtypeIsBlockFmt(dtypeElt); + // Sf A { - // Does the MMA require block scales in TMEM for A? - bool const useBlockScalingA = tg::dtypeIsBlockFmt(dtypeMmaA); - // Are the block scales constant? - bool const useConstSfA = useBlockScalingA && !tg::dtypeIsBlockFmt(dtypeA); // Number of columns for scaling factors of A. - auto const numTmemColsSfA = useConstSfA - ? tg::roundUp((tileK / 64) * 2 * tg::ceilDiv(tileM, 64), 4) - : (useBlockScalingA ? ((tileK / 64) * 2 * tg::ceilDiv(tileM, 64)) * numStages : 0); + auto const numTmemColsSfA + = useBlockScaling ? ((tileK / 64) * 2 * tg::ceilDiv(tileM, 64)) * numStages : 0; // Number of columns for Sf alignment. - auto const numColsAlignmentSfA = 4; + auto const numColsAlignmentSfA = 2; // No need to reuse TMEM. auto const reuseChunksTmemSfA = false; @@ -510,16 +405,11 @@ public: // Sf B { - // Does the MMA require block scales in TMEM for B? - bool const useBlockScalingB = tg::dtypeIsBlockFmt(dtypeMmaB); - // Are the block scales constant? - bool const useConstSfB = useBlockScalingB && !tg::dtypeIsBlockFmt(dtypeB); // Number of columns for scaling factors of B. - auto const numTmemColsSfB = useConstSfB - ? tg::roundUp((tileK / 64) * 2 * tg::ceilDiv(tileN, 64), 4) - : (useBlockScalingB ? ((tileK / 64) * 2 * tg::ceilDiv(tileN, 64)) * numStages : 0); + auto const numTmemColsSfB + = useBlockScaling ? ((tileK / 64) * 2 * tg::ceilDiv(tileN, 64)) * numStages : 0; // Number of columns for Sf alignment. - auto const numColsAlignmentSfB = 4; + auto const numColsAlignmentSfB = 2; // No need to reuse TMEM. auto const reuseChunksTmemSfB = false; @@ -536,8 +426,6 @@ public: } public: - // The MMA kind. - tg::MmaKind mMmaKind; // Helper for SMEM allocation. MemAllocatorHelper mSmemAllocatorHelper; // Helper for TMEM allocation. @@ -566,14 +454,14 @@ inline int32_t getTmemBufferSize(KernelTraits traits) inline int32_t getSmemOffsetLoadA(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadA"); + return traits.mSmemAllocatorHelper.getChunkOffset(0); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetLoadB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadB"); + return traits.mSmemAllocatorHelper.getChunkOffset(1); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -587,63 +475,50 @@ inline int32_t getSmemOffsetLoadAb(KernelTraits traits) inline int32_t getSmemOffsetLoadShuffleB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBShuffle"); + return traits.mSmemAllocatorHelper.getChunkOffset(2); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetGmemC(KernelTraits traits, int resIdx = 0) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemGmemC" + std::to_string(resIdx)); + return traits.mSmemAllocatorHelper.getChunkOffset(3 + resIdx); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetRowMax(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemRowMax"); + return traits.mSmemAllocatorHelper.getChunkOffset(5); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetSliceK(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemSliceK"); + return traits.mSmemAllocatorHelper.getChunkOffset(6); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetPerTokenSf(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemPerTokenSf"); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -inline int32_t getSmemOffsetBias(KernelTraits traits) -{ - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBias"); + return traits.mSmemAllocatorHelper.getChunkOffset(7); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetBlockAmax(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBlockAmax"); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -inline int32_t getSmemOffsetConstSfBuf(KernelTraits traits) -{ - return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemConstSfBuf"); + return traits.mSmemAllocatorHelper.getChunkOffset(8); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0) { - return traits.mSmemAllocatorHelper.getFirstChunkReuseFlagByName("smemGmemC" + std::to_string(resIdx)); + // Be conscious that the index (3 + resIdx) should match the index in getSmemOffsetGmemC(). + return traits.mSmemAllocatorHelper.getFirstChunkReuseFlag(3 + resIdx); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -654,32 +529,30 @@ inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0) inline int32_t getTmemOffsetD(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemD"); + return traits.mTmemAllocatorHelper.getChunkOffset(0); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetA(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemA"); + return traits.mTmemAllocatorHelper.getChunkOffset(1); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetSfA(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfA"); + return traits.mTmemAllocatorHelper.getChunkOffset(2); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetSfB(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfB"); + return traits.mTmemAllocatorHelper.getChunkOffset(3); } //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace gemm - -} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/TmaDescriptor.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/TmaDescriptor.h index 159169e4a8..8d26c4b972 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/TmaDescriptor.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/TmaDescriptor.h @@ -17,7 +17,6 @@ #pragma once #include "trtllm/gen/DtypeDecl.h" -#include "trtllm/gen/MmaDecl.h" #include #ifdef TLLM_ENABLE_CUDA @@ -26,9 +25,6 @@ #include #endif -namespace gemmGatedAct -{ - namespace gemm { @@ -40,15 +36,13 @@ namespace tg = trtllm::gen; #ifdef TLLM_ENABLE_CUDA -inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, std::vector const& shapes, - std::vector const& strides, std::vector const& tileShapes, void* gmemAddr, bool doSwizzle = true) +inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector const& shapes, + std::vector const& strides, int32_t tileSizeMn, int32_t tileSizeK, void* gmemAddr, bool doSwizzle = true) { - // The multiplication factor of the data padding in SMEM. - int32_t padMultiplier = 1; CUtensorMap desc{}; // The data type. CUtensorMapDataType tmaDataFormat{CU_TENSOR_MAP_DATA_TYPE_FLOAT32}; - if (dtype == tg::Dtype::E4m3 || dtype == tg::Dtype::MxE4m3 || dtype == tg::Dtype::UE8m0) + if (dtype == tg::Dtype::E4m3 || dtype == tg::Dtype::MxE4m3) { tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_UINT8; } @@ -64,56 +58,36 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st { tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B; } - else if (dtype == tg::Dtype::MxE2m1) - { - if (mmaKind == tg::MmaKind::MxFp8Fp6Fp4) - { - padMultiplier = 2; - tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B; - } - else - { - // Note: this is used with the MMA kind MxFp4NvFp4 and also when casting to a higher-precision - // type such as Bfloat16 before the MMA. - tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B; - } - } else if (dtype == tg::Dtype::Fp32) { tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_FLOAT32; } else { - std::cerr << "buildNdTmaDescriptor: unexpected dtype " << tg::dtypeToString(dtype) << std::endl; + std::cerr << "buildNdTmaDescriptor: unexpected dtype " << static_cast(dtype) << std::endl; assert(false); } // The swizzle type. CUtensorMapSwizzle swizzleType{CU_TENSOR_MAP_SWIZZLE_NONE}; - int32_t fastestDimTileSizeBytes = (tileShapes[0] * tg::dtypeGetNumBits(dtype) * padMultiplier) / /* bits */ 8; + int32_t tileKSizeInBytes = (tileSizeK * tg::dtypeGetNumBits(dtype)) / /* bits */ 8; if (doSwizzle) { - if ((fastestDimTileSizeBytes % 128) == 0) + if ((tileKSizeInBytes % 128) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_128B; } - else if ((fastestDimTileSizeBytes % 64) == 0) + else if ((tileKSizeInBytes % 64) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_64B; } - else if ((fastestDimTileSizeBytes % 32) == 0) + else if ((tileKSizeInBytes % 32) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_32B; - // This path is only for the scaling factors. - } - else if ((fastestDimTileSizeBytes % 16) == 0 && (dtype == tg::Dtype::UE8m0 || dtype == tg::Dtype::E4m3)) - { - swizzleType = CU_TENSOR_MAP_SWIZZLE_NONE; } else { - std::cerr << "buildNdTmaDescriptor: unexpected fastestDimTileSizeBytes " << fastestDimTileSizeBytes - << std::endl; + std::cerr << "buildNdTmaDescriptor: unexpected tileKSizeInBytes " << tileKSizeInBytes << std::endl; assert(false); } } @@ -123,9 +97,8 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st // Check shape must be in range [1, 2^32] int32_t dim = shapes.size(); - // Expect 2 dimensions for regular gemm, 3 dimensions for batched gemm or blocked layout, and 4 - // dimensions for batched gemm with blocked layout. - assert(dim == 2 || dim == 3 || dim == 4); + // Expect 2 dimensions. + assert(dim == 2 || dim == 3); // Check shape range. for (int32_t ii = 0; ii < dim; ++ii) { @@ -146,78 +119,63 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st } // Set the number of elements in the packed uint32_t element. - auto const numEltsPerUInt32 = 4 * /* bits */ 8 / (tg::dtypeGetNumBits(dtype) * padMultiplier); + auto const numEltsPerUInt32 = 4 * /* bits */ 8 / tg::dtypeGetNumBits(dtype); // The number of elements in 128B. auto const numEltsIn128B = numEltsPerUInt32 /*4B*/ * 32; // The number of tile K hidden size (per token) in each block of shared memory. - auto const numEltsInClampedFastestTileSize = std::min(numEltsIn128B, tileShapes[0]); + auto const numEltsInClampedTileKSize = std::min(numEltsIn128B, tileSizeK); - // Build box dim array. If tileShapes is smaller than dim, just fill with 1s. - assert(static_cast(tileShapes.size()) <= dim); - std::vector boxDim(dim, 1); - boxDim[0] = numEltsInClampedFastestTileSize; - for (size_t ii = 1; ii < tileShapes.size(); ++ii) - { - if (tileShapes[ii] > 256) - { - std::cerr << "buildNdTmaDescriptor: boxDim too large " << tileShapes[ii] << std::endl; - assert(false); - } - else - { - boxDim[ii] = tileShapes[ii]; - } - } + // Build tile shapes. + std::vector tileShapes(dim, 1); + tileShapes[0] = numEltsInClampedTileKSize; // tileSizeK + tileShapes[1] = tileSizeMn; // tileSizeMn // Set tile strides to 1; std::vector tileStrides(dim, 1); // Build the descriptor. CUresult result = cuTensorMapEncodeTiled(&desc, tmaDataFormat, - /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), boxDim.data(), tileStrides.data(), + /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), tileShapes.data(), tileStrides.data(), /*interleave=*/CU_TENSOR_MAP_INTERLEAVE_NONE, swizzleType, /*l2Promotion=*/CU_TENSOR_MAP_L2_PROMOTION_L2_128B, /*oobFill=*/CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); if (result != CUDA_SUCCESS) { - char const* errorString; - cuGetErrorString(result, &errorString); - std::stringstream ss; - ss << "Error: Failed to initialize the TMA descriptor " << result << std::endl; + std::cerr << "Error: Failed to initialize the TMA descriptor " << result << std::endl; - ss << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; + std::cerr << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr + << std::endl; - ss << "Shape: "; + std::cerr << "Shape: "; for (int ii = 0; ii < dim; ++ii) { - ss << shapes[ii] << " "; + std::cerr << shapes[ii] << " "; } - ss << std::endl; + std::cerr << std::endl; - ss << "Stride: "; + std::cerr << "Stride: "; for (int ii = 0; ii < dim - 1; ++ii) { - ss << stridesInBytes[ii] << " "; + std::cerr << stridesInBytes[ii] << " "; } - ss << std::endl; + std::cerr << std::endl; - ss << "tileShapes: "; + std::cerr << "tileShapes: "; for (int ii = 0; ii < dim; ++ii) { - ss << boxDim[ii] << " "; + std::cerr << tileShapes[ii] << " "; } - ss << std::endl; + std::cerr << std::endl; - ss << "tileStrides: "; + std::cerr << "tileStrides: "; for (int ii = 0; ii < dim; ++ii) { - ss << tileStrides[ii] << " "; + std::cerr << tileStrides[ii] << " "; } - ss << std::endl; - ss << "swizzleType: " << int(swizzleType) << std::endl; - ss << "(in " << __FILE__ << ":" << __LINE__ << ")" << std::endl; - throw std::runtime_error(ss.str()); + std::cerr << std::endl; + std::cerr << "swizzleType: " << int(swizzleType) << std::endl; + assert(false); } return desc; @@ -235,7 +193,7 @@ inline CUtensorMap buildSfTmaDescriptor(tg::Dtype dtype, std::vector c } else { - std::cerr << "buildSfTmaDescriptor: unexpected dtype " << tg::dtypeToString(dtype) << std::endl; + std::cerr << "buildSfTmaDescriptor: unexpected dtype " << static_cast(dtype) << std::endl; assert(false); } @@ -285,44 +243,41 @@ inline CUtensorMap buildSfTmaDescriptor(tg::Dtype dtype, std::vector c if (result != CUDA_SUCCESS) { - char const* errorString; - cuGetErrorString(result, &errorString); - std::stringstream ss; - ss << "Error: Failed to initialize the TMA descriptor for SF " << errorString << std::endl; + std::cerr << "Error: Failed to initialize the TMA descriptor for SF " << result << std::endl; - ss << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; + std::cerr << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr + << std::endl; - ss << "shape:"; + std::cerr << "shape:"; for (uint32_t shape_i : shapes) { - ss << " " << shape_i; + std::cerr << " " << shape_i; } - ss << std::endl; + std::cerr << std::endl; - ss << "stridesInBytes:"; + std::cerr << "stridesInBytes:"; for (uint32_t stride_i : stridesInBytes) { - ss << " " << stride_i; + std::cerr << " " << stride_i; } - ss << std::endl; + std::cerr << std::endl; - ss << "tileShapes:"; + std::cerr << "tileShapes:"; for (uint32_t tileShape_i : tileShapes) { - ss << " " << tileShape_i; + std::cerr << " " << tileShape_i; } - ss << std::endl; + std::cerr << std::endl; - ss << "tileStrides:"; + std::cerr << "tileStrides:"; for (uint32_t tileStride_i : tileStrides) { - ss << " " << tileStride_i; + std::cerr << " " << tileStride_i; } - ss << std::endl; + std::cerr << std::endl; - ss << "swizzleType: " << int(swizzleType) << std::endl; - ss << "(in " << __FILE__ << ":" << __LINE__ << ")" << std::endl; - throw std::runtime_error(ss.str()); + std::cerr << "swizzleType: " << int(swizzleType) << std::endl; + assert(false); } return desc; @@ -333,5 +288,3 @@ inline CUtensorMap buildSfTmaDescriptor(tg::Dtype dtype, std::vector c //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace gemm - -} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp deleted file mode 100644 index b72fab1548..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2f2ae6e5978bcf290d8a8c15543d2d0cfebd7ced8a50b649b31be7fbb7957d70 -size 510060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..5a1179b298 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9434907544b528b0f46ab2472402b64bf13a3405745d2c1ce02b2f962bc7c774 +size 297623 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..4df2e220d6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b00b1627dea02258b7b5d895de301efd75c742ac468a220865aea997a704fb4c +size 331481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin.cpp deleted file mode 100644 index 6370417f12..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d26e45bd5ce154380840c146fdb43c0890f511d9d70d74ef04a321ebcd9af432 -size 360460 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp deleted file mode 100644 index 2d660147cb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7e119632f7ff3805be4df7365d88bde6dbd75624ef32f89096d29e0c8b135ed8 -size 272800 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp deleted file mode 100644 index 26efe10198..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:782779a7c5eddff644b62230baf8a2a22287bfd12e76be5f11f8691a2320ddbc -size 310704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp new file mode 100644 index 0000000000..e8d20c2364 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:418acca7365bcedd4fa73d756169184484666ffb5bfee08a281946e6c4d6d94b +size 297745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..e613213678 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98ee8d43aa5f90e048eb334711b861de202f769457cc4db1703ef71d6c75ccf1 +size 222291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..abf57d4c8f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e85b4cb09dda8f57b068b53030f19cdd0552407381bbc5bd50e32490e18c5e16 +size 254473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp deleted file mode 100644 index 62081ce940..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b369ce4afcb6d7f9da536f1d6f0fe6d3fd019fdffb319c6e49d898a9346c44f3 -size 520954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..409920f595 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a98affa59323662168205c915685f87a95c2408fad1c406056f226ec8739333b +size 305853 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..ee2186243f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:460cf57675a1137a9cf0d851413404e2bf2ace1a52a6b427235f94ba1d580efa +size 342425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin.cpp deleted file mode 100644 index 0dd53248ab..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b02f9381be807a90808a0fca8d167bdff29d70d09e384d55633785d6d21ff4b5 -size 358084 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp deleted file mode 100644 index 49fb3961d5..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:edeec84c08133c01a168816bd33fe8b37d3ae8a26eb46b65e674f617ae6fda00 -size 272742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp deleted file mode 100644 index a3b8577865..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3e637312ef7ac2850c19891eb235497da4e38ba4e3ed609101fb65fdbf67260c -size 309068 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp new file mode 100644 index 0000000000..72f329db05 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dbdc8e54e2187ffb345d5a797c9a7039066fb746f253220121095af454924dd +size 300203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..bd8f1bf51e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff17b22e983563501c3339ee7160475e2086cada9e957afb9a1ff89e901b8446 +size 221445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..a94e66978c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca2dbad0de7cd53b39225d96ceaa071e1b27ce3feff861691b7ada7bde9e9411 +size 254415 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp deleted file mode 100644 index ea73d95c76..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:85de6b65b89568cc6b3c993fe10cf9d61840aa7309df36c72e70a189d3e4f216 -size 510052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..cea5c48ce0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b99d98321f44aea8334cfa4dea7317fc0e7dd2eec9b972106bfae5e1bf740bf +size 296825 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..304dcd84b4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ef2353b8bcd016f7f35df660d958cb1b310a944915f655d6984441b264b6855 +size 330685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin.cpp deleted file mode 100644 index cf0b9b0ac9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_swiGlu_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:84100e744c88060db0cb9470db5f520b434a8e0552a48729d102e51add0a1f84 -size 358872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp deleted file mode 100644 index 2b9997d9f3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:811d0ef02dda31d7ac4e6628c3c5e4eff5894d9e6caecea7f629675e3f2949ae -size 272792 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp deleted file mode 100644 index 81bba4befd..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3E4m3_Fp32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c3388ae7d4d3adc92678728b7d86389321a22bde733ddfd9044c036da13798ab -size 309118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp new file mode 100644 index 0000000000..21625a8ff0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3841f0321dbfe7785af9b2c10f4178b664bf6b8166b02ac670ba379868a292b8 +size 296947 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..e110963d8a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4915aeb2ad55d39bc8dc15d9d528710452271ac3959b44c93fdee4fe625dfdb8 +size 221493 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..0bc89b9de3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9416865d0b59145c1ed6db76342e013dfa21137679aac4b56a44cf862fec0f62 +size 254465 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp deleted file mode 100644 index c8854d94ed..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s4_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bac681e5c0b482c6d7e94b87b0d3ff2060eb777098940b322de69f2dfe1f5996 -size 510150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..c9201422eb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8570922d77439b30ef9f1bd0f1de2e48cd0688c52d4c5d7b48a8ab9df80678bb +size 297023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp new file mode 100644 index 0000000000..108db9b8d1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d9b1bac0bdb963a4b5ae0d2c63c77495dcab658079b742b59ec9c9965c9fbea +size 331573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CommonUtils.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CommonUtils.h index 680f22271a..0efa93faf5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CommonUtils.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CommonUtils.h @@ -16,9 +16,6 @@ */ #pragma once -namespace gemmGatedAct -{ - namespace trtllm { namespace gen @@ -44,5 +41,3 @@ inline T roundUp(T m, T n) } // namespace gen } // namespace trtllm - -} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CudaKernelLauncher.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CudaKernelLauncher.h index 60e07c6fce..5d31c37411 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CudaKernelLauncher.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CudaKernelLauncher.h @@ -22,9 +22,6 @@ #include #include #endif -namespace gemmGatedAct -{ - namespace trtllm { namespace gen @@ -92,5 +89,3 @@ inline CUresult launchKernel(void* kernelParams, void* cudaStream, int32_t smemS } // namespace gen } // namespace trtllm - -} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/DtypeDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/DtypeDecl.h index ce0670f9e7..a6892f12ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/DtypeDecl.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/DtypeDecl.h @@ -20,11 +20,6 @@ #include #include #include -#ifndef TLLM_GEN_EXPORT_INTERFACE -#include "trtllm/gen/MmaDecl.h" -#else -#include "MmaDecl.h" -#endif //////////////////////////////////////////////////////////////////////////////////////////////////// // @@ -33,9 +28,6 @@ // //////////////////////////////////////////////////////////////////////////////////////////////////// -namespace gemmGatedAct -{ - namespace trtllm { namespace gen @@ -63,25 +55,26 @@ enum class Dtype : uint32_t // clang-format off Bfloat16 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 16u, /*uid*/ 0u), Bool = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 1u, /*uid*/ 1u), - E2m1 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 4u, /*uid*/ 2u), - E2m3 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 6u, /*uid*/ 3u), - E3m2 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 6u, /*uid*/ 4u), - E4m3 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 5u), - E5m2 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 6u), - Fp16 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 16u, /*uid*/ 7u), - Fp32 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 32u, /*uid*/ 8u), - Int8 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 1u, /*bits*/ 8u, /*uid*/ 9u), - Int32 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 1u, /*bits*/ 32u, /*uid*/ 10u), - Int64 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 1u, /*bits*/ 64u, /*uid*/ 11u), - MxE2m1 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 4u, /*uid*/ 12u), - MxE4m3 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 13u), - UE8m0 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 14u), - UInt8 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 8u, /*uid*/ 15u), - UInt16 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 16u, /*uid*/ 16u), - UInt32 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 32u, /*uid*/ 17u), - UInt64 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 64u, /*uid*/ 18u), - UInt128 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 128u, /*uid*/ 19u), - Void = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 0u, /*uid*/ 20u), + PadType = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 1u, /*bits*/ 8u, /*uid*/ 2u), + E2m1 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 4u, /*uid*/ 3u), + E2m3 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 6u, /*uid*/ 4u), + E3m2 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 6u, /*uid*/ 5u), + E4m3 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 6u), + E5m2 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 7u), + Fp16 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 16u, /*uid*/ 8u), + Fp32 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 32u, /*uid*/ 9u), + Int8 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 1u, /*bits*/ 8u, /*uid*/ 10u), + Int32 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 1u, /*bits*/ 32u, /*uid*/ 11u), + Int64 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 1u, /*bits*/ 64u, /*uid*/ 12u), + MxE2m1 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 4u, /*uid*/ 13u), + MxE4m3 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 14u), + UE8m0 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 15u), + UInt8 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 8u, /*uid*/ 16u), + UInt16 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 16u, /*uid*/ 17u), + UInt32 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 32u, /*uid*/ 18u), + UInt64 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 64u, /*uid*/ 19u), + UInt128 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 128u, /*uid*/ 20u), + Void = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 0u, /*uid*/ 21u), // clang-format on #undef TLLM_ENCODE_DTYPE @@ -160,7 +153,6 @@ inline std::string dtypeToString(Dtype dtype) case Dtype::Int32: return "Int32"; case Dtype::Int64: return "Int64"; case Dtype::MxE4m3: return "MxE4m3"; - case Dtype::MxE2m1: return "MxE2m1"; case Dtype::UE8m0: return "UE8m0"; case Dtype::UInt8: return "UInt8"; case Dtype::UInt16: return "UInt16"; @@ -213,50 +205,5 @@ inline Dtype dtypeGetBlockSfType(Dtype dtype) //////////////////////////////////////////////////////////////////////////////////////////////////// -inline MmaKind dtypeGetMmaKind(Dtype dtypeA, Dtype dtypeB) -{ - auto dtypeEltA = dtypeEltType(dtypeA); - auto dtypeEltB = dtypeEltType(dtypeB); - - // Note: the order of the conditions is important here. - if ((dtypeA == Dtype::Fp16 && dtypeB == Dtype::Fp16) || (dtypeA == Dtype::Bfloat16 && dtypeB == Dtype::Bfloat16)) - { - return MmaKind::Fp16; - } - - if ((dtypeA == Dtype::Int8 || dtypeA == Dtype::UInt8) && (dtypeB == Dtype::Int8 || dtypeB == Dtype::UInt8)) - { - return MmaKind::Int8; - } - - // This statement captures both MxE2m1 and E2m1. - if (dtypeEltA == Dtype::E2m1 && dtypeEltB == Dtype::E2m1) - { - return MmaKind::MxFp4NvFp4; - } - - if ((dtypeA == Dtype::E4m3 || dtypeA == Dtype::E5m2 || dtypeA == Dtype::E2m3 || dtypeA == Dtype::E3m2 - || dtypeA == Dtype::E2m1) - && (dtypeB == Dtype::E4m3 || dtypeB == Dtype::E5m2 || dtypeB == Dtype::E2m3 || dtypeB == Dtype::E3m2 - || dtypeB == Dtype::E2m1)) - { - return MmaKind::Fp8Fp6Fp4; - } - - // At this point we know that both dtypes are Mx types and not both MxE2m1 at the same time. - if ((dtypeEltA == Dtype::E4m3 || dtypeEltA == Dtype::E5m2 || dtypeEltA == Dtype::E2m3 || dtypeEltA == Dtype::E3m2 - || dtypeEltA == Dtype::E2m1) - && (dtypeEltB == Dtype::E4m3 || dtypeEltB == Dtype::E5m2 || dtypeEltB == Dtype::E2m3 || dtypeEltB == Dtype::E3m2 - || dtypeEltB == Dtype::E2m1)) - { - return MmaKind::MxFp8Fp6Fp4; - } - return MmaKind::Tf32; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - } // namespace gen } // namespace trtllm - -} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/MmaDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/MmaDecl.h deleted file mode 100644 index f3822f89fa..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/MmaDecl.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & - * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -namespace gemmGatedAct -{ - -namespace trtllm -{ -namespace gen -{ - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -// The kind of the MMA instruction -enum class MmaKind : uint32_t -{ - // For Blackwell this follows the PTX ISA description of the MMA instructions. - // https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-kind-shapes - - // The MMA type is auto-detected from the dtypes of the input tensors - Auto = 0, - // Supports dtypeA = dtypeB = Fp16 and dtypeD = [Fp16, Fp32] - // or dtypeA = dtypeB = Bfloat16 and dtypeD = [Fp32] - // Corresponds to the kind::f16 of tcgen05.mma. - Fp16 = 1, - // Supports dtypeA/B = [E4m3, E5m2, E2m3, E3m2, E2m1] and dtypeD = [Fp16, Fp32] - // Corresponds to the kind::f8f6f4 of tcgen05.mma. - Fp8Fp6Fp4 = 2, - // Supports dtypeA = dtypeB = [Int8, Uint8] and dtypeD = [Int32] - // Corresponds to the kind::i8 of tcgen05.mma. - Int8 = 3, - // Supports dtypeA = dtypeB = [MxE2m1, E2m1] with block scale [UM8e0, UEm4e3] - // and dtypeD = [Fp32] - // Corresponds to the kind::mxf4nvf4 of tcgen05.mma. - MxFp4NvFp4 = 4, - // Supports dtype dtypeA = dtypeB = [MxE4m3, MxE2m1] with block scale [UM8e0] - // and dtypeD = [Fp32] - // Corresponds to the kind::mxf8f6f4 of tcgen05.mma. - MxFp8Fp6Fp4 = 5, - // Supports dtypeA = dtypeB = Tf32 with dtypeD = [Fp32] - // Corresponds to the kind::tf32 of tcgen05.mma. - Tf32 = 6 -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -inline bool mmaKindIsBlockFmt(MmaKind mmaKind) -{ - return mmaKind == MmaKind::MxFp8Fp6Fp4 || mmaKind == MmaKind::MxFp4NvFp4; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -// For logging and error reporting -inline std::string mmaKindToString(MmaKind mmaKind) -{ - switch (mmaKind) - { - case MmaKind::Auto: return "Auto"; - case MmaKind::Fp16: return "Fp16"; - case MmaKind::Fp8Fp6Fp4: return "Fp8Fp6Fp4"; - case MmaKind::Int8: return "Int8"; - case MmaKind::MxFp4NvFp4: return "MxFp4NvFp4"; - case MmaKind::MxFp8Fp6Fp4: return "MxFp8Fp6Fp4"; - case MmaKind::Tf32: return "Tf32"; - default: assert(false); return "Unsupported type"; - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace gen -} // namespace trtllm - -} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/SfLayoutDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/SfLayoutDecl.h index 9dca3cce24..f86c383259 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/SfLayoutDecl.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/SfLayoutDecl.h @@ -26,9 +26,6 @@ // //////////////////////////////////////////////////////////////////////////////////////////////////// -namespace gemmGatedAct -{ - namespace trtllm { namespace gen @@ -92,5 +89,3 @@ inline std::string sfLayoutToString(SfLayout layout) } // namespace gen } // namespace trtllm - -} // namespace gemmGatedAct