From 345c2bceaa0eea04bfb46d012712598db1da1a4e Mon Sep 17 00:00:00 2001 From: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com> Date: Mon, 4 Aug 2025 18:15:25 +0800 Subject: [PATCH] update trtllm-gen sm100f cubins of gemm kernels Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com> --- .../BatchedGemmInterface.h | 62 +- .../trtllmGen_bmm_export/BatchedGemmOptions.h | 61 +- .../GemmGatedActOptions.h | 9 +- .../trtllmGen_bmm_export/GemmOptions.h | 5 +- .../trtllmGen_bmm_export/KernelMetaInfo.h | 1524 +++-- .../trtllmGen_bmm_export/KernelParams.h | 1421 ++-- .../trtllmGen_bmm_export/KernelParamsDecl.h | 547 ++ .../trtllmGen_bmm_export/KernelTraits.h | 74 +- .../trtllmGen_bmm_export/TmaDescriptor.h | 6 +- .../trtllmGen_bmm_export/config.json | 14 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - 
...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...tput_dsFp8_batchN_dynamic_sm100a_cubin.cpp | 3 - ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ...eMmaOutput_batchN_dynamic_sm100a_cubin.cpp | 3 - ...persistent_batchN_dynamic_sm100a_cubin.cpp | 3 - ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 - ...atchN_routeLdgsts_dynamic_sm100a_cubin.cpp | 3 - ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 - 
..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 - ...oseMmaOutput_dsFp8_batchN_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ...transposeMmaOutput_batchN_sm100a_cubin.cpp | 3 - ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 - ..._routeLdgsts_silu_dynamic_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...nsOut_schedS_bN_dynBatch_sm100a_cubin.cpp} | 4 +- ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + 
...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - 
...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - 
..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._transOut_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ..._transOut_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ..._dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ..._dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 + ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - 
...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedP_bN_dynBatch_sm100a_cubin.cpp | 3 - ...ansOut_schedS_bN_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - 
...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + 
...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ..._transOut_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 - ..._transOut_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ...schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp | 3 + ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - 
...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ..._transOut_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ..._transOut_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 - ...t_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 - ...56b_TN_transOut_schedS_bN_sm100a_cubin.cpp | 3 + ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - ...S_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp | 3 - .../trtllmGenKernels/gemm/KernelRunner.cpp | 4 +- .../gemm/trtllmGen_gemm_export/Enums.h | 56 + .../trtllmGen_gemm_export/GemmInterface.h | 70 +- .../gemm/trtllmGen_gemm_export/GemmOptions.h | 340 +- .../trtllmGen_gemm_export/KernelMetaInfo.h | 833 ++- .../gemm/trtllmGen_gemm_export/KernelParams.h | 774 +-- .../trtllmGen_gemm_export/KernelParamsDecl.h | 324 + .../gemm/trtllmGen_gemm_export/KernelTraits.h | 174 +- .../trtllmGen_gemm_export/TmaDescriptor.h | 133 +- .../gemm/trtllmGen_gemm_export/config.json | 9 - 
...luster1x1x1_16dp256bit_TN_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ...luster1x1x1_16dp256bit_TN_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ...bit_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...tK2_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...bit_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ...luster1x1x1_16dp256bit_TN_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ...bit_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...tK2_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...luster1x1x1_16dp256bit_TN_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ...luster1x1x1_16dp256bit_TN_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - 
..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ..._transposeMmaOutput_dsFp8_sm100a_cubin.cpp | 3 - ...bit_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...tK2_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...bit_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...luster1x1x1_16dp256bit_TN_sm100a_cubin.cpp | 3 - ...bit_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...bit_TN_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...a1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...a1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...dp256b_TN_transOut_schedS_sm100a_cubin.cpp | 3 + ...plitK2_TN_transOut_schedS_sm100a_cubin.cpp | 3 + ...dp256b_TN_transOut_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...a1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...dp256b_TN_transOut_schedS_sm100a_cubin.cpp | 3 + ...plitK2_TN_transOut_schedS_sm100a_cubin.cpp | 3 + ...a1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + 
...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...a1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...sOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp | 3 + ...dp256b_TN_transOut_schedS_sm100a_cubin.cpp | 3 + ...plitK2_TN_transOut_schedS_sm100a_cubin.cpp | 3 + ...dp256b_TN_transOut_schedS_sm100a_cubin.cpp | 3 + ...a1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp | 3 + ...dp256b_TN_transOut_schedS_sm100a_cubin.cpp | 3 + ...dp256b_TN_transOut_schedS_sm100a_cubin.cpp | 3 + .../gemmGatedAct/KernelRunner.cpp | 22 +- .../gemmGatedAct/KernelRunner.h | 4 +- .../trtllmGen_gatedAct_export/Enums.h | 61 + .../GemmGatedActInterface.h | 110 +- .../GemmGatedActOptions.h | 50 +- .../trtllmGen_gatedAct_export/GemmOptions.h | 602 +- .../KernelMetaInfo.h | 5904 ++++++++++++++++- .../trtllmGen_gatedAct_export/KernelParams.h | 308 +- .../trtllmGen_gatedAct_export/KernelTraits.h | 207 +- .../trtllmGen_gatedAct_export/TmaDescriptor.h | 153 +- ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + 
...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...r1x1x1_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...plitK4_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...ma128x128x32_cluster1x1x1_sm100a_cubin.cpp | 3 - ...r1x1x1_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...plitK4_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...r1x1x1_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...plitK4_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + 
...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...ma128x128x32_cluster1x1x1_sm100a_cubin.cpp | 3 - ...r1x1x1_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...plitK4_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...r1x1x1_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...plitK4_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...ma128x128x32_cluster1x1x1_sm100a_cubin.cpp | 3 - ...r1x1x1_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...plitK4_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...r1x1x1_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...plitK4_transposeMmaOutput_sm100a_cubin.cpp | 3 - ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + 
...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + ...TN_transOut_schedS_swiGlu_sm100a_cubin.cpp | 3 + .../trtllm/gen/CommonUtils.h | 5 + .../trtllm/gen/CudaKernelLauncher.h | 5 + .../trtllm/gen/DtypeDecl.h | 93 +- .../trtllm/gen/MmaDecl.h | 90 + .../trtllm/gen/SfLayoutDecl.h | 5 + cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp | 1 + .../thop/fp8PerTensorScalingTrtllmGenGemm.cpp | 10 +- 582 files changed, 11547 insertions(+), 4159 deletions(-) create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp rename cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/{BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp => Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp} (81%) delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp 
create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x512_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x512u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x512_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x512u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete 
mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x512_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x512u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x512_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x512u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParamsDecl.h delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp delete mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp delete mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 
cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp create mode 100644 cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/MmaDecl.h diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h index 53bd7bc33c..49d23d13dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h @@ -244,11 +244,48 @@ struct BatchedGemmData // Shape is [B]. float const* mPtrScaleGate{nullptr}; + // The clamp limit for the accumulator before applying the activation. + // Shape is [B]. + // Clamp is INF if nullptr. + // When the input is FP8 or NVFP4, the clamp has to be scaled by limit' = limit / dequantAb. + // If applied on SwiGlu, it will be: + // + // x_glu = x_glu.clamp(min=None, max=limit) + // x_linear = x_linear.clamp(min=-limit, max=limit) + // + // The given clamp limit applies to the dequantized values, so the order of operations would + // look something like this: + // + // x0 = x0 * dqAb + // x0 = clamp(x0, none, limit) + // x0 = x0 * sigmoid(alpha * x0) + // x1 = dqAb * x1 + // x1 = clamp(x1, -limit, limit) + // out = qC * (x1 + beta) * x0 + // + // Given that the dqAb and qC are combined into scaleC, we can bring the dqAb into the clamp + // limit and apply the clamping prior to dequantization: + // + // x0 = clamp(x0, none, limit / dqAb) + // x0 = x0 * dqAb + // x0 = x0 * sigmoid(alpha * x0) + // x1 = clamp(x1, -limit / dqAb, limit / dqAb) + // scaleC = dqAb * qC + // beta' = beta / dqAb + // out = scaleC * (x1 + beta') * x0 + // + // Note this assumes that scaleAb == scaleGate which is true in TRT-LLM MoE use-case + // + float const* mPtrClampLimit{nullptr}; + // The alpha and beta for SwiGlu. // gatedActivation <- (x0 + beta) * activation(x1, alpha) // Shape is [B]. // Alpha is 1.f if nullptr. // Beta is 0.f if nullptr. 
+ // The formula: + // + // out_glu = x_glu * torch.sigmoid(alpha * x_glu) + (x_linear + beta) float const* mPtrSwiGluAlpha{nullptr}; float const* mPtrSwiGluBeta{nullptr}; @@ -591,6 +628,7 @@ int32_t BatchedGemmInterface::run(BatchedGemmConfig const& config, void* workspa { // Might be used. (void) usePdl; + (void) moduleCache; // Get options from config and data. auto options = getOptionsFromConfigAndData(config, batchedGemmData); @@ -642,17 +680,17 @@ int32_t BatchedGemmInterface::run(BatchedGemmConfig const& config, void* workspa auto const numCtaZ = options.mNumSlicesForSplitK; mNumCtas = numCtaX * numCtaY * numCtaZ; - auto kernelParams = KernelParams::setKernelParams(options, batchM, batchedGemmData.mInputBuffers.mPtrA, + auto kernelParams = KernelParamsSetup::setKernelParams(options, batchM, batchedGemmData.mInputBuffers.mPtrA, batchedGemmData.mInputBuffers.mPtrB, batchedGemmData.mOutputBuffers.mPtrC, batchedGemmData.mInputBuffers.mPtrSfA, batchedGemmData.mInputBuffers.mPtrSfB, batchedGemmData.mInputBuffers.mPtrPerTokenSfA, batchedGemmData.mInputBuffers.mPtrPerTokenSfB, batchedGemmData.mInputBuffers.mPtrBias, batchedGemmData.mOutputBuffers.mPtrSfC, batchedGemmData.mInputBuffers.mPtrScaleC, batchedGemmData.mInputBuffers.mPtrScaleGate, - batchedGemmData.mInputBuffers.mPtrSwiGluAlpha, batchedGemmData.mInputBuffers.mPtrSwiGluBeta, - batchedGemmData.mInputBuffers.mPtrRouteMap, dPtrRowMax, dPtrRowMaxBars, - batchedGemmData.mInputBuffers.mPtrNumNonExitingCtas, batchedGemmData.mInputBuffers.mPtrTotalNumPaddedTokens, - batchedGemmData.mInputBuffers.mPtrCtaIdxXyToBatchIdx, batchedGemmData.mInputBuffers.mPtrCtaIdxXyToMnLimit, - maxNumCtasInBatchDim); + batchedGemmData.mInputBuffers.mPtrClampLimit, batchedGemmData.mInputBuffers.mPtrSwiGluAlpha, + batchedGemmData.mInputBuffers.mPtrSwiGluBeta, batchedGemmData.mInputBuffers.mPtrRouteMap, dPtrRowMax, + dPtrRowMaxBars, batchedGemmData.mInputBuffers.mPtrNumNonExitingCtas, + 
batchedGemmData.mInputBuffers.mPtrTotalNumPaddedTokens, batchedGemmData.mInputBuffers.mPtrCtaIdxXyToBatchIdx, + batchedGemmData.mInputBuffers.mPtrCtaIdxXyToMnLimit, maxNumCtasInBatchDim); // The size of the grid. std::vector grid{numCtaX, numCtaY, numCtaZ}; @@ -660,26 +698,26 @@ int32_t BatchedGemmInterface::run(BatchedGemmConfig const& config, void* workspa #ifdef TLLM_GEN_EXPORT_INTERFACE CUmodule cuModule; CUfunction cuFunction; + if (moduleCache.has_value()) { ModuleCache& moduleCacheRef = moduleCache.value().get(); - // Modules are associated with a specific context so include the ctxId in the key + // Modules are associated with a specific context, so the context is included in the key CUcontext ctx; unsigned long long ctxId; cuCtxGetCurrent(&ctx); cuCtxGetId(ctx, &ctxId); - // Reinterpret the ctxId as a string to avoid needing a custom hash or converting it to a string in decimal - // representation. + // Reinterpret the ctxId as a string to avoid needing a custom hash or converting it to a + // string in decimal representation. std::string const ctxName = std::string(reinterpret_cast(&ctxId), sizeof(unsigned long long) / sizeof(char)); std::string const funcName = std::string(config.mFunctionName); - // As the ctxName is a fixed number of bytes, the two strings can just be appended without risk of a collision auto const moduleKey = ctxName + funcName; auto module = moduleCacheRef.find(moduleKey); - // Check if module exists in cache. 
Otherwise, load it + // Use cache if module is found, otherwise load and insert into cache if (module != moduleCacheRef.end()) { cuFunction = std::get<1>(module->second); @@ -716,7 +754,7 @@ int32_t BatchedGemmInterface::run(BatchedGemmConfig const& config, void* workspa { return -1; } - // If a module cache has not been given, unload the module to avoid overflow + // If a module cache has not been given, unload the module to avoid leaking if (!moduleCache.has_value()) { cuModuleUnload(cuModule); diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h index 148a2cb185..01b85f4bcc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h @@ -96,10 +96,10 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions int tileK, bool useUnrollLoop2xForMma, bool useCustomMmaSchedule, bool useHoistTryWaitForCustomMmaSchedule, bool useDeepSeekFp8, bool usePerTokenSfA, bool usePerTokenSfB, bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, - int32_t sfReshapeFactor, gemm::TileScheduler tileScheduler, gemmGatedAct::ActType actType, + int32_t sfReshapeFactor, gemm::TileScheduler tileScheduler, gemmGatedAct::ActType actType, bool clampBeforeAct, std::vector batchedM, std::vector batchedN, BatchMode batchMode, int numBatches, bool isStaticBatch, int numTokens, RouteImpl routeImpl, bool gridWaitForPrimaryRouting, bool fusedAct, - int numRegsPerThreadNonEpilogueWarp, int numRegsPerThreadEpilogueWarp, int numRegsCastAWarps) + int numRegsPerThreadNonEpilogueWarp, int numRegsPerThreadEpilogueWarp, int numRegsCastAWarps, bool useTmaOobOpt) : gemmGatedAct::GemmGatedActOptions( 
gemm::GemmOptions(allReduceAlgo, biasType, blockK, clusterDimX, clusterDimY, clusterDimZ, dtypeAcc, dtypeA, dtypeB, dtypeC, dtypeMmaA, dtypeMmaB, enablesEarlyExit, enablesDelayedEarlyExit, enablesGlobalPtxKnobs, @@ -112,19 +112,20 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions useCustomMmaSchedule, useHoistTryWaitForCustomMmaSchedule, useDeepSeekFp8, usePerTokenSfA, usePerTokenSfB, useTmaStore, useTwoTmaLoadWarps, useTwoMmaWarps, sfLayoutA, sfLayoutB, sfLayoutC, sfReshapeFactor, tileScheduler), - actType) + actType, clampBeforeAct) , mBatchedM(batchedM) , mBatchedN(batchedN) , mBatchMode(BatchMode(batchMode)) - , mNumBatches(numBatches) - , mIsStaticBatch(isStaticBatch) - , mNumTokens(numTokens) - , mRouteImpl(routeImpl) - , mGridWaitForPrimaryRouting(gridWaitForPrimaryRouting) , mFusedAct(fusedAct) + , mGridWaitForPrimaryRouting(gridWaitForPrimaryRouting) + , mIsStaticBatch(isStaticBatch) + , mNumBatches(numBatches) , mNumRegsPerThreadNonEpilogueWarp(numRegsPerThreadNonEpilogueWarp) , mNumRegsPerThreadEpilogueWarp(numRegsPerThreadEpilogueWarp) , mNumRegsCastAWarps(numRegsCastAWarps) + , mNumTokens(numTokens) + , mRouteImpl(routeImpl) + , mUseTmaOobOpt(useTmaOobOpt) { } @@ -134,28 +135,28 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions std::vector mBatchedN; // Whether batching M or N. BatchMode mBatchMode{BatchMode::BatchM}; - // Number of Gemm batches. - int mNumBatches; - - // Whether the batch size is static (i.e. known at kernel launch time). - bool mIsStaticBatch{true}; - // Total number of tokens. - int mNumTokens{32}; - // Whether load the input tokens and do routing. - RouteImpl mRouteImpl{RouteImpl::NoRoute}; + // Whether to perform a fused gated activation. + bool mFusedAct{false}; // Whether the loads that load from ptrRouteMap, ptrTotalNumPaddedTokens, // ptrCtaIdxXyToBatchIdx, etc.. should wait on a grid dependency. 
bool mGridWaitForPrimaryRouting{true}; - - // Whether to perform a fused gated activation. - bool mFusedAct{false}; - + // Whether the batch size is static (i.e. known at kernel launch time). + bool mIsStaticBatch{true}; + // Number of Gemm batches. + int mNumBatches; // Number of registers per thread for non-epilogue warps int mNumRegsPerThreadNonEpilogueWarp{0}; // Number of registers per thread for epilogue warps int mNumRegsPerThreadEpilogueWarp{0}; // Number of registers for the cast A warps. int mNumRegsCastAWarps{0}; + // Total number of tokens. + int mNumTokens{32}; + // Whether to load the input tokens and do routing. + RouteImpl mRouteImpl{RouteImpl::NoRoute}; + // Whether to use TMA out-of-bounds optimization to reduce wasted traffic. See details in + // BatchedGemm/KernelParamsDecl.h. + bool mUseTmaOobOpt{false}; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -165,6 +166,20 @@ bool checkAndUpdateBatchedGemmOptions(BatchedGemmOptions& options, bool isBlackw { bool isValid = true; + if (options.mUseTmaOobOpt && !options.mUseTwoTmaLoadWarps) + { + if (updateOptions) + { + // Any routing (mRouteImpl != NoRoute) requires mUseTwoTmaLoadWarps == true. + // Single TMA load warp is not the target use case for OOB optimization.
+ options.mUseTmaOobOpt = false; + } + else + { + TLLM_CHECK_ERROR(false, "TMA OOB optimization requires two TMA load warps."); + return false; + } + } if (options.mFusedAct) { // ensure that we check the fused options as well @@ -340,6 +355,7 @@ struct BatchedGemmConfig uint32_t const mSharedMemSize{0}; char const* mFunctionName{nullptr}; uint32_t const mNumThreadsPerCTA{0}; + char const* mHash{nullptr}; #else trtllm::gen::CudaRunner* mCudaRunner{nullptr}; #endif @@ -366,7 +382,8 @@ inline std::string dumpOptions(BatchedGemmOptions const& options) ss << "mFusedAct=" << options.mFusedAct << "," << std::endl; ss << "mNumRegsPerThreadNonEpilogueWarp=" << options.mNumRegsPerThreadNonEpilogueWarp << "," << std::endl; ss << "mNumRegsPerThreadEpilogueWarp=" << options.mNumRegsPerThreadEpilogueWarp << "," << std::endl; - ss << "mNumRegsCastAWarps=" << options.mNumRegsCastAWarps << std::endl; + ss << "mNumRegsCastAWarps=" << options.mNumRegsCastAWarps << "," << std::endl; + ss << "mUseTmaOobOpt=" << options.mUseTmaOobOpt << std::endl; return ss.str(); } diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h index 1eb8361522..deedee27ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmGatedActOptions.h @@ -101,14 +101,17 @@ struct GemmGatedActOptions : public gemm::GemmOptions { GemmGatedActOptions() = default; - GemmGatedActOptions(gemm::GemmOptions options, ActType actType) + GemmGatedActOptions(gemm::GemmOptions options, ActType actType, bool clampBeforeAct) : gemm::GemmOptions(options) , mActType(actType) + , mClampBeforeAct(clampBeforeAct) { } // Type of the gated activation. ActType mActType{ActType::SwiGlu}; + // Clamp the dequantized values to the range [-limit, limit]. 
+ bool mClampBeforeAct{false}; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -175,7 +178,8 @@ inline std::string dumpOptions(GemmGatedActOptions const& options) std::stringstream ss; ss << gemm::dumpOptions(options) << ", "; ss << "mActType=" - << "gemmGatedAct::ActType(" << static_cast(options.mActType) << ")" << std::endl; + << "gemmGatedAct::ActType(" << static_cast(options.mActType) << ")," << std::endl; + ss << "mClampBeforeAct=" << options.mClampBeforeAct << "" << std::endl; return ss.str(); } @@ -196,6 +200,7 @@ struct GemmGatedActConfig uint32_t const mSharedMemSize{0}; char const* mFunctionName{nullptr}; uint32_t const mNumThreadsPerCTA{0}; + char const* mHash{nullptr}; #else trtllm::gen::CudaRunner* mCudaRunner{nullptr}; #endif diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h index 8f88db0d33..7d25c117a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h @@ -354,6 +354,7 @@ struct GemmConfig uint32_t const mSharedMemSize{0}; char const* mFunctionName{nullptr}; uint32_t const mNumThreadsPerCTA{0}; + char const* mHash{nullptr}; #else trtllm::gen::CudaRunner* mCudaRunner{nullptr}; #endif @@ -526,6 +527,7 @@ inline int32_t getShuffleBlockSize(int epilogueTileM) inline bool checkAndUpdateGemmOptions( GemmOptions& options, bool isBlackwell, int /* tpGrpSize */, bool updateOptions = true) { + if (options.mDtypeB == tg::Dtype::Void) { if (updateOptions) @@ -566,7 +568,8 @@ inline bool checkAndUpdateGemmOptions( // Currently, we only support {MxFp4, NvFp4} -> Bf16. 
TLLM_CHECK_ERROR((options.mDtypeA == options.mDtypeMmaA) || ((options.mDtypeA == tg::Dtype::MxE2m1 || options.mDtypeA == tg::Dtype::E2m1) - && options.mDtypeMmaA == tg::Dtype::Bfloat16), + && options.mDtypeMmaA == tg::Dtype::Bfloat16) + || (options.mDtypeA == tg::Dtype::E2m1 && options.mDtypeMmaA == tg::Dtype::E4m3), "Unsupported cast for A: ", tg::dtypeToString(options.mDtypeA), " -> ", tg::dtypeToString(options.mDtypeMmaA)); // Check that the B cast is supported. diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h index 9c1b31ef0f..32b52710cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h @@ -28,229 +28,229 @@ namespace kernels { // clang-format off -#define TLLM_GEN_COMMIT "72019269-dirty" -#define TLLM_GEN_EXPORT_VERSION "6.0.3.0.2.1" +#define TLLM_GEN_COMMIT "32110ebf-dirty" +#define TLLM_GEN_EXPORT_VERSION "7.0.3.0.3.0" static constexpr size_t tllmGenBatchedGemmListLen = 104; #ifndef EXCLUDE_SM_100 -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; 
-extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; -extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; -extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin[]; +extern unsigned char 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char 
Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char 
Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin[]; +extern unsigned char Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; +extern unsigned char Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin[]; #endif // EXCLUDE_SM_100 #ifndef EXCLUDE_SM_100 -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int 
Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; -extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; -extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; -extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int 
Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern 
unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int 
Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len; +extern unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len; +extern unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; +extern unsigned int 
Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len; #endif // EXCLUDE_SM_100 static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { #ifndef EXCLUDE_SM_100 -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "6a0bf2c102efef21017cfd8c1ea75e72cbadee5e1cb82c2abfbb2370cf28948d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -318,6 +318,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -330,8 +331,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "a3126bb4254dcab10372faae7e93a2c743ce97735fc19ba35bda963f524ea8f8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -399,6 +401,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -411,8 +414,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, 
gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "d6ecf38671485ed929e33358d61c931ed1851601dba1a899d4e0f7484e8aa6d6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -480,6 +484,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -492,8 +497,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 136192, 
"bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "848bd68403ebc01cd94dcb0988f5890f914bfed6bafa4a1470b34902516eca49", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -561,6 +567,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -573,8 +580,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "25a8cb0f1b214b59a76feae166eb73dcff644be5065987846dba5eb393fcba19", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -642,6 +650,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -654,8 +663,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "0f582ab94bcee39077607621b7fed01ce79838f77f3abf17ce65439bf57ab9d6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -723,6 +733,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -735,8 +746,9 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "94b39523665c17ec783c22b4fbf47523132146e365b4c450db6ba4b6f52afb56", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -804,6 +816,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -816,8 +829,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "d18a2f733180a782f35e022fbbe76417e8c143a117e6d6b80045fe1faf48d38c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -885,6 +899,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -897,8 +912,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, 
gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "0eb05008fb00aac51d2e132d1dad9587b602aced72a2a60a036b4ddb30acb781", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -966,6 +982,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -978,8 +995,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 154624, 
"bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "378127bf506de81c353b6c9ad1567f567fbc8f33b5fb799217c52e44ffc9a0e9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1047,6 +1065,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1059,8 +1078,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "5bc57208f3139adf191b7a21c5fd987e85268aba7f960af0759a413b3a477510", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1128,6 +1148,7 @@ 
static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1140,8 +1161,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "3aa161b02771849cb272013bfa7964e9bf5aa10712e653fbd12213baf9a4bd4b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1209,6 +1231,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1221,8 
+1244,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "dc1c044fbb88c65f06c90b998bff9dfca616a5e45c5f9189d01063d9300ca1a6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1290,6 +1314,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1302,8 +1327,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "fa6f4a009ed7eb28b0f9b3a12cd2174b5fb53bf869f698b3a708548ec34157a4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1371,6 +1397,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1383,8 +1410,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, 
gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "783ab74ed00ce17fcad0aa518d27fc54ffa3a5a5d35a1fae41560e55c56c4534", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1452,6 +1480,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1464,8 +1493,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 200704, 
"bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "89b83df6a7fdb94d161b15c1acaa99e5d009eb1db723598d31512fc293d14825", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1533,6 +1563,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1545,8 +1576,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "54a368788db560b6ab92ba6de4503a9d715a660396f85f29859133236feb0c66", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1614,6 +1646,7 @@ static 
const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1626,8 +1659,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "685ff9164cbb048dafde74cdb3a9e83fdafc3585f9d1fde713bc40fbbcc5ba1d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1695,6 +1729,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1707,8 +1742,9 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "c21e7ffe165ed982b6515aac1e99513237efcc7c2b612d7a8ba04583d4503938", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1776,6 +1812,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1788,8 +1825,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "e94003ae6b02fbb2872455161ed537c65c13bc49ed1bb4173dc950ca36fd4d9c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1857,6 +1895,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1869,8 +1908,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, 
gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "e51bf3386fd7135ef7a21eb495fb8a82eec138b68729b5f60e992c47739d7e6c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1938,6 +1978,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -1950,8 +1991,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, 
"bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "9f9498b2cba3f9cffc14f6f3c2b1bf5d49d175da485fdd28bdc7e33d84bc6fa5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2019,6 +2061,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2031,8 +2074,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "0e1f1f36b5de227bf129c1b4ab9f9346b8e586f894dfe1ef8d110150c0ea4d03", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2100,6 +2144,7 @@ 
static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2112,8 +2157,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "d74ce52dbeb5c157047bf4c333ceca8bf2faaea6d453c6870e7b4e9a0bffbfc5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2181,6 +2227,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2193,8 
+2240,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "480e7f648ebd09b8238ab40897e1b4c5a5034d6d82df66e64912d999fbcbd6a0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2262,6 +2310,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2274,8 +2323,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "cf2750f986199840ad6f9505db4791b98321bc81f0b097630eee5088d142f986", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2343,6 +2393,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2355,8 +2406,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "65981a268c85412e963e9eff70363151eb568d3bf5ab4177909ed36f98afa006", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2424,6 +2476,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2436,8 +2489,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 126976, 
"bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "32e40d5c9a6bac77cde65c0a7dc8c16621aa84d83a9cfaf7899aa5545e068530", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2505,6 +2559,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2517,8 +2572,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "9ddd33b56695cff3d8dd10c0fcaf9499be97858bcdca9bf4a4e5d6ba0d84c0eb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2586,6 +2642,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2598,8 +2655,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "7d1f7c8a2e70a49a877399c5c3d2fc7df6f9cbaf2c0a0019c8f476485d618b6e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2667,6 +2725,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2679,8 +2738,9 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "9df00bde72fe7c36c09c2989183fccaaa90f7a4e79d6d0cce36c78049a270a85", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2748,6 +2808,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2760,8 +2821,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, 
Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "b61ee79c9c7595deee79157bd2d10de9d500b9c36d68a569b585d80ed854c35a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2829,6 +2891,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2841,8 +2904,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "1ff7246442e16863ccf377d88a7bee44e8dd96aeb16e6bdb4a6dd5f84b40ff84", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2910,6 +2974,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -2922,8 +2987,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, 
"759fceb6fdf0026cf171b5c5aa8468a18930170c8d52813b0c3d5c4ef825b6c6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2991,6 +3057,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3003,8 +3070,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "c10774753e0c8c08bf8c90aaf8a8f81d6e2dc0fda372beb895b5c3af7fc4ef31", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3072,6 +3140,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3084,8 +3153,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "80ae2cb645bb1ea1a885f7223dc75f1a12a14ec1e6519e8355c9d4af179c1e30", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3153,6 +3223,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3165,8 +3236,9 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "a8ebd0574dc30c1e06d4da89716e4c040109d5481718e9bd33bd2d919f933e53", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3234,6 +3306,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3246,8 +3319,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, 
-{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "e4c4e89c846e3b3250dc10e30ac956d04d14957d738118b019e934890b00c108", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3315,6 +3389,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3327,8 +3402,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 97280, 
"bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "881acb91bd5ff416df31d7e9b95206c9a7349f3e0f06d3313fa1c9e642fa88bf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3396,6 +3472,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3408,8 +3485,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "ab9799b4898c64e9c1f13d351dd32a59e290486f4e3321863d691f4029b61c42", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3477,6 +3555,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3489,8 +3568,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 97280, 
"bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "ffba44ffc7b841b0b9073057e8e210c2077091bc7f6139bb553d4a701daec667", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3558,6 +3638,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3570,8 +3651,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "b1711c24e234aaf742a93d5bfef0dc2c311530929f95803918c54d45e12a99c2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , 
/* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3639,6 +3721,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3651,8 +3734,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "0c81fd570d07a6321297366a9c9f63b48f918a4a4c7b5fdf6691a994da3d80e4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3720,6 +3804,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* 
mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3732,8 +3817,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "d7223efec141e24a2ed159dd0df9af8f753528628349fcd351d522a33e8a8276", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3801,6 +3887,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3813,8 +3900,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, 
gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "8a965ae0938f555b7b0e73df5359c2b94c105d9c851f62946bafbf144f3c9ed9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3882,6 +3970,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3894,8 +3983,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 
123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "83578cefe6e21fc33064333510c1286de8420a883cca7a93c608c4e4951677aa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3963,6 +4053,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -3975,8 +4066,92 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "f479ff64bf61dfc0bcba3d4d45903dc296010c55feade97734082aff2de5d0bd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* 
mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 128 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 1 +, /* mNumTokens */ 0 +, /* mRouteImpl */ batchedGemm::RouteImpl(0) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "c8f81b23270d852d3e05b6450d0ed40b0d909a9f379031448a6b1bee209394bc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4044,6 +4219,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* 
mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4056,8 +4232,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "2e7362e02b49aa744e37838af305803da9e132e73affcad605bd9495bf3902bc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4125,6 +4302,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4137,8 +4315,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, 
-{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "051562e78b1477ddef4292dbc040048ca3c85ca6dc269b3346a8cebb7775c254", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4191,7 +4370,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseUnrollLoop2xForMma */ 1 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 @@ -4206,6 +4385,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4218,8 +4398,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, 
-{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a", 416, "976391756f5832221aba8f0097f93a0cd02e2cde58ce5a28bfad5fef6ac645c4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4287,6 +4468,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4299,8 +4481,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 149504, 
"bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a", 384, "66c8412d8c3ebf58b73ec4554f40f5830904f5065d70692127004c0ab04e7330", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4368,6 +4551,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4380,89 +4564,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ 
trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1052672) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, 
/* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 1 -, /* mNumTokens */ 0 -, /* mRouteImpl */ batchedGemm::RouteImpl(0) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 256, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 256, "44bc9b7ab82a5c8d7766ea00c5b044139c11cd5e10d2ed63f8890046a693e6e7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4530,6 +4634,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4542,8 +4647,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, 
gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 224, "6db5429b6d878d9c6d271de05b9edaf402a36a8489e093fe79d7e78d7b0feee8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4611,6 +4717,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4623,8 +4730,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 
224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "06ed8ead992feb9910c535c2a1a91824861afdfd7e81d82bc4631746bd35ef34", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4692,6 +4800,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4704,8 +4813,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 256, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 217088, 
"bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 256, "df44cda477bd4710b32d020b9832a160202dd54c446ec1a0c35c8fa3a4ea7285", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4773,6 +4883,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4785,8 +4896,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 224, "3c781c5cde3e9cb98e53f1b1be7a527244d9250fca90d3b32af8f384d350deac", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4854,6 +4966,7 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4866,8 +4979,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "7d0191cb8570a15064c853dd7be2578c81657e1a43fe42b24271c993d4b9ddea", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4935,6 +5049,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -4947,8 +5062,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "de8e30b21e02c42bdc7778a538097d2afda49b07aaca23dca798dbbf7a757759", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5016,6 +5132,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5028,8 +5145,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "959d4374bd5435bde60198df8d5cf70e29aa9879b2f1314083b77c6c0381dc1f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5097,6 +5215,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5109,8 +5228,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "968c88380b4577288c7e88b0bed6eb93f1ad63c9b17dc999f3d7b7a5f4c2735b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5178,6 +5298,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5190,8 +5311,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "d6179d09f734c89c251d182d6f9377b82f1c908f02ced331fc8c5e0caeebeb75", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5259,6 +5381,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5271,8 +5394,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, 
"112967cd390a0800afadd8ee956782779787b2598986b00951ed464b7129b9cf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5340,6 +5464,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5352,8 +5477,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "a2f3febed9a167985b958e95c0496e26d69563410da2fdfd5897d8ed7e96411b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5421,6 +5547,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5433,8 +5560,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "d463f269d50fd378c72f26faa8afa3af0b0eb2eb98aebfa369acda464d08bac1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5502,6 +5630,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5514,8 +5643,9 @@ static const 
batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "c00f39dadad7d89058bf39415e3182029d4d9ead814360c59287ed864e91de8b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5583,6 +5713,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5595,8 +5726,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, 
-{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "d055950053983b9acfc627db7d794e449607a78403c4938e7dd0ce35a5bdce86", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5664,6 +5796,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5676,8 +5809,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "49461402a3a0a2e6bef4ba252f922069f387951f88ffa6a17a585cf737b34082", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5745,6 +5879,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5757,8 +5892,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "1425eae206c8f26ae978ba006ca8c568aa93e558c38fbf9ae1a245412175232d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5826,6 +5962,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5838,8 +5975,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 
178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "4e312fcf2ef735587987e0ddf18ff1fea51276b0b2da421bbaadd0e80aff44a4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5907,6 +6045,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -5919,8 +6058,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "4e9f19a11006291683600c47b0817498fe3acd82bbd11b1783d4b677d827b3a2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK 
*/ -1 , /* mClusterDimX */ 1 @@ -5988,6 +6128,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6000,8 +6141,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "6354eed0dec928ba6e6c25b4b9d1c4385ae6f9ed9be908c82d87726ebd7317f8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6069,6 +6211,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , 
/* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6081,8 +6224,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "a98f41f0f45bafb35078da89ac4290a1a025b235c5e96bd9fd4ab0dd78bb8e1d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6150,6 +6294,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6162,8 +6307,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, 
Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "ae447ae26810ef8447fe5c8ec183cae3fbd4206f29965f532cf0c89861464516", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6231,6 +6377,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6243,8 +6390,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "4cdf70f8dca79f9a7cb001a2440c7ff01b1581d71d36f20032e76e5aaffcd01b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6312,6 +6460,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6324,8 +6473,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 165888, 
"bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "af9f731cb3519d14d81b49896e0b4bf6f7452aaf6374a1bd62a0ea267c92d58f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6393,6 +6543,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6405,8 +6556,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "6df8273d9a887d374a2762c4df8210bdcf2fdf407b9944a904a0ebd576bf6487", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* 
mClusterDimX */ 1 @@ -6474,6 +6626,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6486,8 +6639,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "63138b6c484124ae15eafeebcb041be45626060e93d0af7e87136ac74ba2ba4f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6555,6 +6709,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* 
mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6567,8 +6722,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "e4607395041f8c0a0ad7bf84a783af9e3c82d5aec43ebda987a22ca7f3d774f1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6636,6 +6792,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6648,8 +6805,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* 
mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "dd4254f835bd2ac7320c5ee7f108acf87d7c108918e8e7581f0551c78481ea23", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6717,6 +6875,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6729,8 +6888,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, 
Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "92681b5f9d31cb04b18dad631032459aeb37e28f94a327e3251c1e97d678ade8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6798,6 +6958,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6810,8 +6971,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "eabf7d440d758b2f8a63806443459b1278979022ed52365579b323e48e2bf2c5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6879,6 +7041,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6891,8 +7054,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, 
Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "eb11125d29cbd44870617b2be27b6cc134efb42dd2076e64780bdc3a80d9012f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6960,6 +7124,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -6972,8 +7137,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "67e8a00d8bca5f44f8b93f1b19c1ac0f181a7b757c44374f3411f44f8ec87b1f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7041,6 +7207,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7053,8 +7220,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "faebb1be2f8db4ae77f693a4f0a75dd3986d629009d71f929db24ee06a644c37", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ 
gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7122,6 +7290,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7134,8 +7303,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "330e800b99311c05d0fdac89222b0c95c7f561474a0ca59cc1b620e01b246815", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7203,6 +7373,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* 
mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7215,8 +7386,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "e3f90e1d12fa25dbf30d04e4433ac1a9d93fa328c02fd6d0d0f966726fb51344", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7284,6 +7456,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7296,8 +7469,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* 
mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "34e89d6b7ac2a8a35461f2778c6909ee231fa94c7955bbd7ffe5a6c714dd0e9a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7365,6 +7539,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7377,170 +7552,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, 
Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 4 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 2 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, 
/* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(1) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mBiasType */ gemm::BiasType(0) -, /* mBlockK */ -1 -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) -, /* mDtypeA */ trtllm::gen::Dtype(1050629) -, /* mDtypeB */ trtllm::gen::Dtype(1050629) -, /* mDtypeC */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) -, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) -, /* mEnablesEarlyExit */ 1 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueLdtmDps */ 16 -, /* mEpilogueLdtmBits */ 256 -, /* mEpilogueTileM */ 64 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* 
mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistLoadTaskInit */ 1 -, /* mHoistMmaTaskTryWaits */ 1 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mLayoutA */ gemm::MatrixLayout(0) -, /* mLayoutB */ gemm::MatrixLayout(0) -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaKind */ trtllm::gen::MmaKind(2) -, /* mMmaM */ 64 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 1 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 8 -, /* mNumStagesMma */ 2 -, /* mNumStagesMmaWithinWorkTile */ 2 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mPatchF2fp */ 0 -, /* mUseShuffledMatrixA */ 0 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 0 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 1 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mSfReshapeFactor */ 1 -, /* mTileScheduler */ gemm::TileScheduler(0) -, /* mActType */ gemmGatedAct::ActType(0) -, /* mBatchedM */ {} -, /* mBatchedN */ {} -, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) -, /* mNumBatches */ 2 -, /* mIsStaticBatch */ 0 -, /* mNumTokens */ 2 -, /* mRouteImpl */ batchedGemm::RouteImpl(1) -, /* mGridWaitForPrimaryRouting */ 1 -, /* mFusedAct */ 0 -, /* mNumRegsPerThreadNonEpilogueWarp */ 0 -, /* mNumRegsPerThreadEpilogueWarp */ 0 -, /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin, 
Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "ff3bfed14ac5707da369899773c9a6e7b179b1f788d508ef4076542a41837475", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7608,6 +7622,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7620,8 +7635,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "a77de53ef185c6c9e576d904651f4730041804db5adc0a93d000f770a35ebb4c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7674,7 +7690,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 @@ -7689,6 +7705,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(1) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7701,8 +7718,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, 
gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "e48f9da3611ec12be2ac1d17826b6a65a134a3e7f3c5101b4d1961ccf182ff29", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7755,7 +7773,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 128 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 1 @@ -7770,6 +7788,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7782,8 +7801,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, 
gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "216e8f52552e5156288f85a82ddb9ee95acdeac42e24be8896889de9c14785cf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7851,6 +7871,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7863,8 +7884,175 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 256, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, 
"bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "40de19e18295201f77feeca58ac8fc73a35a3a20e444dd68f3af137384555bcc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 4 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 2 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 128 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUsePerTokenSfA */ 0 +, /* 
mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(1) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "9f4a2d439ef5608eb43e4683d8405e20a8231d5e05eb6105636f3b33bcdd94e3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 1 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 64 +, /* mEpilogueTileN 
*/ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 1 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 1 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 64 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 8 +, /* mNumStagesMma */ 2 +, /* mNumStagesMmaWithinWorkTile */ 2 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 0 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 128 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 1 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) +, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 +, /* mBatchedM */ {} +, /* mBatchedN */ {} +, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mNumBatches */ 2 +, /* mIsStaticBatch */ 0 +, /* mNumTokens */ 2 +, /* mRouteImpl */ batchedGemm::RouteImpl(1) +, /* mGridWaitForPrimaryRouting */ 1 +, /* mFusedAct */ 0 +, /* mNumRegsPerThreadNonEpilogueWarp */ 0 +, /* mNumRegsPerThreadEpilogueWarp */ 0 +, /* mNumRegsCastAWarps */ 0 +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, 
+{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 256, "1dff5de9e3c79ee18c9593162a23ff5f19b2074735c0c6d68dd979296ced197d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7932,6 +8120,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -7944,8 +8133,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, 
"53c5fcfb99477cd81d8da10033146f17fb2a7b886d7824e412127c1ced2c1865", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8013,6 +8203,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8025,8 +8216,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 256, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 256, "fa0dd0b52f880b99f0e2bed59ca0ac46251adf291608aadf2cefd79886c8ce88", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8094,6 +8286,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* 
mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8106,8 +8299,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "dd40bdc871a1fb89f5ce777833f49db5168e5a418ce9affd81779d4120148466", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8175,6 +8369,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8187,8 +8382,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* 
mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "b4890c8cb043c6feebbd9cdc40fdb91314dd47d747e2e6ae18143a6b3b282450", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8256,6 +8452,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8268,8 +8465,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* 
mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "a0caa4f383910792fdb1fc3e7db17b7f2a8025a2a9ff5be621485321ff0f433d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8337,6 +8535,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8349,8 +8548,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, 
"1a2c81269bc212521650e4be0a3b1442bcc8186ee39158339f40f1d2b6c0c474", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8418,6 +8618,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8430,8 +8631,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a", 384, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a", 384, "0cd7c32083e34e9f66e3b2902771e25da1b092ccd76dde3410d0aa7dfa9ace45", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8499,6 +8701,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ 
gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8511,8 +8714,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "12334edbcc35771adaab31c1d686604741fe698dcb74af3126b78095f3c9c60f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8580,6 +8784,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8592,8 +8797,9 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, 
-{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "577604c36f4fe236191cf1b4977330568cc02c612e954fd3a7a2cd88b87e75c6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8661,6 +8867,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) , /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) @@ -8673,7 +8880,8 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mNumRegsPerThreadNonEpilogueWarp */ 0 , /* mNumRegsPerThreadEpilogueWarp */ 0 , /* mNumRegsCastAWarps */ 0 - }, gemm::SmVersion::Sm100a }, +, /* mUseTmaOobOpt */ 0 + }, gemm::SmVersion::Sm100a}, #endif // EXCLUDE_SM_100 }; // clang-format on diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h index 0ebe9a94c8..79b96109c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h @@ -18,11 +18,19 @@ #include "trtllm/gen/CommonUtils.h" #include "trtllm/gen/SfLayoutDecl.h" +#include #include "BatchedGemmEnums.h" #include "Enums.h" #include "TmaDescriptor.h" +// NOTE: keep this code dependency free. It has to be included by the device code and has to be +// compilable with NVRTC. +#include "KernelParamsDecl.h" + +namespace batchedGemm +{ + namespace batchedGemm { @@ -41,976 +49,569 @@ namespace tg = trtllm::gen; //////////////////////////////////////////////////////////////////////////////////////////////////// -struct KernelParams +namespace KernelParamsSetup { #ifdef TLLM_ENABLE_CUDA - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // BatchedGemm parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - // Maximum number of CTAs - static constexpr int MaxNumCtas = 2048; +enum class MatrixType +{ + MatrixA = 0, + MatrixB, + MatrixC +}; - // TMA descriptor for A. - // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideAbc. - // - // If batchM: - // Logical shape is [sum(divUpMul(M[bi], tileM) for bi in B), K]. - // Logical strides are [K, 1]. - // Tile box shape is [tileM, tileK]. - // Tile box strides are [tileK, 1]. - // - // If batchN: - // If layoutA is MatrixLayout::MajorK - // Logical shape is [B, divUpMul(M, tileM), K]. - // Logical strides are [divUpMul(M, tileM) * K, K, 1]. - // Tile box shape is [1, tileM, tileK]. - // Tile box strides are [0, tileK, 1]. - // If layoutA is MatrixLayout::Mn - // Logical shape is [B, K, divUpMul(M, tileM)]. - // Logical strides are [K * divUpMul(M, tileM), divUpMul(M, tileM), 1]. - // Tile box shape is [1, tileK, tileM]. - // Tile box strides are [0, tileM, 1]. 
- // If layoutA is MatrixLayout::BlockMajorK - // Logical shape is [B, K / blockK, divUpMul(M, tileM), blockK]. - // Logical strides are [K * divUpMul(M, tileM), divUpMul(M, tileM) * blockK, blockK, 1]. - // Tile box shape is [1, tileK / min(blockK, tileK), tileM, min(blockK, tileK)]. - // Tile box strides are [0, tileM * min(blockK, tileK), min(blockK, tileK), 1]. - // where blockK is 128B. - // - // Dtype is set from options.mDtypeA. - CUtensorMap tmaA[1]; +////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Utility functions. +// +////////////////////////////////////////////////////////////////////////////////////////////////// - // TMA descriptor for B. - // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideAbc. - // - // If batchM: - // If layoutB is MatrixLayout::MajorK - // Logical shape is [B, divUpMul(N, tileN), K]. - // Logical strides are [divUpMul(N, tileN) * K, K, 1]. - // Tile box shape is [1, tileN, tileK]. - // Tile box strides are [0, tileK, 1]. - // If layoutB is MatrixLayout::MajorMn - // Logical shape is [B, K, divUpMul(N, tileN)]. - // Logical strides are [K * divUpMul(N, tileN), divUpMul(N, tileN), 1]. - // Tile box shape is [1, tileK, tileN]. - // Tile box strides are [0, tileN, 1]. - // If layoutB is MatrixLayout::BlockMajorK - // Logical shape is [B, K / blockK, divUpMul(N, tileN), blockK]. - // Logical strides are [K * divUpMul(N, tileN), divUpMul(N, tileN) * blockK, blockK, 1]. - // Tile box shape is [1, tileK / min(blockK, tileK), tileN, min(blockK, tileK)]. - // Tile box strides are [0, tileN * min(blockK, tileK), min(blockK, tileK), 1]. - // where blockK is 128B. - // - // If batchN: - // Logical shape is [sum(divUpMul(N[bi], tileN) for bi in B), K]. - // Logical strides are [K, 1]. - // Tile box shape is [tileN, tileK]. - // Tile box strides are [tileK, 1]. - // - // Dtype is set from options.mDtypeB. 
- CUtensorMap tmaB[1]; +template +bool useTmaOobOptA(BatchedGemmOptions const& options) +{ + return options.mBatchMode == BatchedGemmOptions::BatchMode::BatchM && doesRouteImplUseNoRoute(options.mRouteImpl) + && options.mUseTmaOobOpt; +} - // TMA descriptor for C, (when useTmaStore is true) - // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideAbc. - // - // If batchM: - // Logical shape is [sum(divUpMul(M[bi], tileM) for bi in B), N]. - // Logical strides are [N, 1]. - // Tile box shape is [epilogueTileM, epilogueTileN]. - // Tile box strides are [epilogueTileN, 1]. - // - // If batchN: - // Logical shape is [sum(divUpMul(N[bi], tileN) for bi in B), M]. - // Logical strides are [M, 1]. - // Tile box shape is [epilogueTileN, epilogueTileM]. - // Tile box strides are [epilogueTileM, 1]. - // - // Dtype is set from options.mDtypeC. - CUtensorMap tmaC[1]; +////////////////////////////////////////////////////////////////////////////////////////////////// - // TMA descriptor for the block scaling factors for A, for MxFp{4,8} and NvFp4 formats. - // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from - // makeTmaShapeStrideSfAb. - // The layout of scaling factors for A is always R128c4. - // - // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. - // M must be a multiple of 128. - // K must be a multiple of 4P. - // The "logical" shape is: [paddedM, K / P], where paddedM is - // sum(divUpMul(M[bi], tileM) for bi in B) if batchM, - // otherwise divUpMul(M, TileM) * B. - // The R128c4 layout is: [paddedM / 128, K / P / 4, 512]. - // The shape we use for TMA is: [paddedM / 128, K / P / 4, 2, 256]. - // - // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. 
- CUtensorMap tmaSfA[1]; +template +bool useTmaOobOptB(BatchedGemmOptions const& options) +{ + return options.mBatchMode == BatchedGemmOptions::BatchMode::BatchN && doesRouteImplUseNoRoute(options.mRouteImpl) + && options.mUseTmaOobOpt; +} - // TMA descriptor for the block scaling factors for B, for MxFp{4,8} and NvFp4 formats. - // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from - // makeTmaShapeStrideSfAb. - // The layout of block scaling factors for B is controlled by options.mSfLayoutB. - // - // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. - // The "logical" shape is: [paddedN, K / 16] - // where paddedN is sum(divUpMul(N[bi], tileN) for bi in B) if batchN, - // otherwise divUpMul(N, TileN) * B. - // - // If the layout is R128c4, - // paddedN must be a multiple of 128. - // K must be a multiple of 4P. - // The R128c4 layout is: [paddedN / 128, K / P / 4, 512] - // The shape we use for TMA is: [paddedN / 128, K / P / 4, 2, 256] - // - // If the layout is R8c4, - // paddedN must be a multiple of 8. - // K must be a multiple of 4P. - // The R8c4 layout is: [paddedN / 8, K / P / 4, 32] - // The shape we use for TMA is: [paddedN / 8, K / P / 4 / repeats, repeats * 32] - // where repeats = min(tileK / P / 4, 8) - // - // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. - CUtensorMap tmaSfB[1]; +////////////////////////////////////////////////////////////////////////////////////////////////// - // The input matrix A. - // If (routeAct == true && batchM), the shape is [M, K]. tmaA is not used. - // Otherwise, check layout of tmaA to see the shape and strides. - void const* ptrA{nullptr}; +template +bool useTmaOobOptC(BatchedGemmOptions const& options) +{ + return options.mUseTmaStore && options.mUseTmaOobOpt; +} - // The stride for matrix A in bytes. - // Equals to K * dtypeGetNumBits(dtypeA) / 8. 
- uint64_t strideInBytesA; +////////////////////////////////////////////////////////////////////////////////////////////////// - // The input matrix B. - // If (routeAct == true && batchN), the shape is [N, K]. tmaB is not used. - // Otherwise, check layout of tmaB to see the shape and strides. - void const* ptrB{nullptr}; - // The stride for matrix B in bytes. - // Equals to K * dtypeGetNumBits(dtypeB) / 8. - uint64_t strideInBytesB; +// Create the TMA shape/stride for A/B/C. +template +static auto makeTmaShapeStrideAbc( + GemmOptions const& options, int mM, int mN, int mK, int tileM, int tileN, int tileK, MatrixType matrixType) +{ + // Weights matrix is A if we transpose the output of MMA (to have it M-major). + // Otherwise, it is B, when the output of MMA is K-major. + bool const isWeights = (matrixType == MatrixType::MatrixA && options.mTransposeMmaOutput) + || (matrixType == MatrixType::MatrixB && !options.mTransposeMmaOutput); - // The output matrix C. Check "logical" layout of tmaC to see the shape and strides. - void* ptrC{nullptr}; + // Whether to use TMA OOB trick to block out padded dummy tokens and saving BW whenever no routing + // is involved. It applies to batchM and matrixA, or batchN and matrixB, or any case for matrixC. + bool const useTmaOobOpt = matrixType == MatrixType::MatrixA ? useTmaOobOptA(options) + : matrixType == MatrixType::MatrixB ? useTmaOobOptB(options) + : matrixType == MatrixType::MatrixC ? useTmaOobOptC(options) + : false; - // Inputs and output are MxFp{4,8}, Fp8, NvFp4. - // The scaling factors to apply to the output - can be used to incorporate input scaling factors - // as described below: C = SEncC * act(SDecA * SDecB * A * Bl) . (SDecA * SDecB * A * Br) - // -> ScaleGate = SDecA * SDecB - // ScaleC = SDecA * SDecB * SEncC - // - // Only the inputs are MxFp{4,8}, Fp8, NvFp4. - // C = act(SDecA * SDecB * A * Bl) . 
(SDecA * SDecB * A * Br) - // -> ScaleGate = SDecA * SDecB - // ScaleC = SDecA * SDecB - // - // Only the output is MxFp{4,8}, Fp8, NvFp4. - // C = SEncC * act(A * Bl) . (A * Br) - // -> ScaleGate = 1 - // ScaleC = SEncC - // - // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. - // TensorRT-LLM API requires a scaling factor on the device. - // Shape is [B]. One scaling factor per tensor in batch. - float const* ptrScaleC{nullptr}; + // The outer dimension. + auto numTokens = (matrixType == MatrixType::MatrixA || matrixType == MatrixType::MatrixC) ? mM : mN; + // The outer dimension tile size. + auto ctaTileNumTokens = (matrixType == MatrixType::MatrixA || matrixType == MatrixType::MatrixC) ? tileM : tileN; + // The outer dimension of TMA box shape. + auto tileNumTokens = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileM : ctaTileNumTokens; - // The output gate scale for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. - // TensorRT-LLM API requires a scaling factor on the device. - // Shape is [B]. One scaling factor per tensor in batch. - float const* ptrScaleGate{nullptr}; + // The inner dimension. + auto hiddenSize = (matrixType == MatrixType::MatrixC) ? mN : mK; + // The inner dimension tile size. + auto ctaTileHiddenSize = (matrixType == MatrixType::MatrixC) ? tileN : tileK; + // The inner dimension of TMA box shape. + auto tileHiddenSize = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileN : ctaTileHiddenSize; - // The alpha and beta for SwiGlu. - // Shape is [B]. One alpha and one beta per tensor in batch. - // Alpha is 1.f if nullptr. - // Beta is 0.f if nullptr. - float const* ptrSwiGluAlpha{nullptr}; - float const* ptrSwiGluBeta{nullptr}; - - // The K dimension. It is the hidden dimension of the input matrices. - int32_t k; - - // The non-batched dimension. - // It is N if batchM, otherwise M. - int32_t nm; - - // Tile stride per batch for the non-batched dimension. 
- // It is N / TileN if batchM, otherwise M / TileM. - int32_t tileStridePerBatch; - - // TODO get rid of that. - // DeepSeek FP8 scaling factors for C - float* ptrDqSfsC{nullptr}; - - // The block scaling factors for A. - // The pointer must always be set regardless of the quantization recipe. - // If (routeAct == true && batchM), the shape is [M, K / 16]. tmaSfA is not used. - // For the layout (r128c4), see below. - // Otherwise, - // If MxFp{4,8} and NvFp4 formats are used, - // check the "logical" layout of tmaSfA to see the shape and strides. - // The dtype is Dtype::E4m3. - // - // If DeepSeek FP8 quantization recipe is used, - // If batchM: - // The shape is [K / 128, paddedM], - // where paddedM is sum(divUpMul(M[bi], tileM) for bi in B). - // If batchN: - // The shape is [M / 128, K / 128], - // The rightmost dimension is contiguous in memory. - // The dtype is Dtype::Float32. - void const* ptrSfA{nullptr}; - - // The block scaling factors for B. - // The pointer must always be set regardless of the quantization recipe. - // If (routeAct == true && batchN), the shape is [N, K / 16]. tmaSfB is not used. - // For the layout (r128c4, r8c4), see below. - // Otherwise, - // If MxFp{4,8} and NvFp4 formats are used, - // check the layout of tmaSfB to see the shape and strides. - // The dtype is Dtype::E4m3. - // - // If DeepSeek FP8 quantization recipe is used, - // If batchM: - // The shape is [N / 128, K / 128], - // If batchN: - // The shape is [K / 128, paddedN], - // where paddedN is sum(divUpMul(N[bi], tileN) for bi in B). - // The rightmost dimension is contiguous in memory. - // The dtype is Dtype::Float32. - void const* ptrSfB{nullptr}; - - // The per-token scaling factors from scale A. - // - // This is used for either: - // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 - // * When the routing scales are applied to the input activations (only when output is not - // transposed). 
The dtype is Dtype::Bfloat16 - // - // if (batchM (A is activations)): - // Logical shape is [sum(divUpMul(M[bi], tileM) for bi in B)] - // - // if (batchN (A is weights)): - // Logical shape is [B, divUpMul(M, tileM)] - // - void const* ptrPerTokenSfA{nullptr}; - - // The per-token scaling factors from scale B. - // - // This is used for either: - // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 - // * When the routing scales are applied to the input activations (only when output is - // transposed). The dtype is Dtype::Bfloat16 - // - // if (batchM (B is weights)): - // Logical shape is [B, divUpMul(N, tileN)] - // - // if (batchN (B is activations)): - // Logical shape is [sum(divUpMul(N[bi], tileN) for bi in B)] - void const* ptrPerTokenSfB{nullptr}; - - // The bias applied after the GEMM and before the activation function. - // The bias is applied before applying the global scaling factor. I.e. - // C = act(A * B + bias') * scaleC - // scaleC = dequantA * dequantB * quantC - // Thus, the bias' = bias / (dequantA * dequantB), where the bias is the original bias. - // - // If batchM, BiasType must be N, and bias shape is [B, N]. - // The bias is broadcasted along the M dimension. - // - // If batchNm BiasType must be M, and bias shape is [B, M]. - // The bias is broadcasted along the N dimension. - // - // The dtype is float32. - void const* ptrBias{nullptr}; - - // The output block scaling factors for C. - // - // If MxFp{4,8} and NvFp4 formats are used, - // The "logical" shape is: - // if batchM: [paddedM, N / 16] - // if batchN: [paddedN, M / 16] - // where paddedM is sum(divUpMul(M[bi], tileM) for bi in B), - // where paddedN is sum(divUpMul(N[bi], tileN) for bi in B). - // - // If the layout is R128c4, - // paddedOuter must be a multiple of 128. - // inner must be a multiple of 64. 
- // The R128c4 layout is: [paddedOuter / 128, inner / 16 / 4, 512] - // The shape we use for TMA is: [paddedOuter / 128, inner / 16 / 4, 2, 256] - // where inner = N if batchM, otherwise M. - // where paddedOuter = paddedM if batchM, otherwise paddedN. - // - // If the layout is R8c4, - // paddedOuter must be a multiple of 8. - // inner must be a multiple of 64. - // The R8c4 layout is: [paddedOuter / 8, inner / 16 / 4, 32] - // The shape we use for TMA is: [paddedOuter / 8, inner / 16 / 4 / repeats, repeats * 32] - // where repeats = min(tileInner / 16 / 4, 8), - // where tileInner = tileN if batchM, otherwise tileM, - // where paddedOuter = paddedM if batchM, otherwise paddedN. - // where inner = N if batchM, otherwise M. - // - // The dtype is Dtype::E4m3. - // - // If DeepSeek FP8 quantization recipe is used, - // If batchM: - // The shape is [N / 128, paddedM], - // where paddedM is sum(divUpMul(M[bi], tileM) for bi in B). - // If batchN: - // The shape is [M / 128, paddedN], - // where paddedN is sum(divUpMul(N[bi], tileN) for bi in B). - // The rightmost dimension is contiguous in memory. - // The dtype is Dtype::Float32. - void* ptrSfC{nullptr}; - - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // Routing activations parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - // These params are used when the kernel is configured with -routeAct true. - // The inputs are not padded, but the outputs are padded to divUpMul(M[bi], tileM) for batchM or - // divUpMul(N[bi], tileN) for batchN. - // If -routeAct is false, the params are not used and should be set to zero. - - // The routeMap for the input tokens. - // Map of expanded token index (counting the previous padded tokens) to the batch index - // the token belongs to. 
- // The shape is - // [sum(divUpMul(M[bi], tileM) for bi in B)] for batchM - // [sum(divUpMul(N[bi], tileN) for bi in B)] for batchN - // The dtype is int32_t. - // - // There are 3 tokens [0, 1, 2] such that [0, 1] belong to batch [B0] and [2] to batch [B1]. - // Let's assume that the padded size is 4. - // - // The expanded indices for tokens [0, 1, 2] are: - // expandedIdx[0] = 0 - // expandedIdx[1] = 1 - // expandedIdx[2] = divUpMul(2, 4) + 0 = 4 - // - // The route map is [B0, B0, X, X, B1, X, X, X] where X could be any value. - int32_t const* ptrRouteMap{nullptr}; - - // Total number of unpadded inputs - int32_t numTokens; - - // Total number of batches - int32_t numBatches; - - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // Batching information parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - - // In some cases, some CTAs must early-exit. E.g. when the grid size is set statically, but the - // actual workload is decided at runtime. This element on the device contains the number of CTAs - // that do not early-exit. The number corresponds to the X dim of the grid when the output is not - // transposed (i.e. batchM). To the Y dim, otherwise. - // The size is 1 and the dtype is int32_t. - // Used if isStaticBatch == false, otherwise set to nullptr. - // The pointer points to a scalar and the dtype is int32_t. The pointed value must be >= 0. - int32_t const* ptrNumNonExitingCtas{nullptr}; - - // Pointer to total number of padded tokens. - // Computed as - // int32_t totalNumPaddedTokens{0}; - // for (int bi = 0; bi < options.mNumBatches; bi++) { - // totalNumPaddedTokens += batchM ? divUpMul(options.mBatchedM[bi], options.mTileM) - // : divUpMul(options.mBatchedN[bi], options.mTileN); - // } - // The size is 1 and the dtype is int32_t. 
- // If isStaticBatch == true, ptrTotalNumPaddedTokens should be set to nullptr and - // totalNumPaddedTokens is used. - int32_t const* ptrTotalNumPaddedTokens{nullptr}; - - // Pointer to the map from the CTA index (in X/Y dim) to the batch index. - // Maps CTA index in batch dim (i.e. blockDim.x if batchM, otherwise blockDim.y) - // to batch index. - // E.g. with listM = 128,255,32 and tileM = 128, should be equal to - // ctaIdxXyToBatchIdx = [0, 1, 1, 2] - // If isStaticBatch == true, ptrCtaIdxXyToBatchIdx should be set to nullptr and ctaIdxXyToBatchIdx - // is used. - int32_t const* ptrCtaIdxXyToBatchIdx{nullptr}; - - // Pointer from the CTA index X/Y to the expanded tile index where the expanded tile index is - // computed as: - // - // int expandedIdx = 0; - // for (int bi = 0; bi < batchIdx-1; ++bi) { - // expandIdx = divUpMul(numTokens[bi], TileM/N); - // } - // expandIdx += - // E.g. with numTokens = [128,255,32] and tileM = 128, should be equal to - // ptrCtaIdxXyToMnLimit = [128, 256, 383, 416] - int32_t const* ptrCtaIdxXyToMnLimit{nullptr}; - - // Total number of padded tokens - used as the stride for the activation and C scaling factors. - // Check ptrTotalNumPaddedTokens to see how it is computed. - // If isStaticBatch == true, totalNumPaddedTokens is used, otherwise ptrTotalNumPaddedTokens. - int32_t totalNumPaddedTokens; - - // A map from CTA index X/Y to batch index. - // Check ptrCtaIdxXyToBatchIdx to see how it is computed. - // If isStaticBatch == true, ctaIdxXyToBatchIdx is used, otherwise ptrCtaIdxXyToBatchIdx. - int32_t ctaIdxXyToBatchIdx[MaxNumCtas]; - - // **Expanded** limits for the batched dimension: - // tile * ctaIdxXyToTileIdxMn[ctaIdxXy] -> ctaIdxXyToMnLimit[ctaIdxXy] - // Check ptrCtaIdxXyToMnLimit to see how it is computed. - // If isStaticBatch == true, ctaIdxXyToMnLimit is used, otherwise ptrCtaIdxXyToMnLimit. 
- int32_t ctaIdxXyToMnLimit[MaxNumCtas]; - - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // All-reduce parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - - // The rank id of the current device in the multi-gpu space. - int rank; - // The number of peer devices in tensor-parallel group. - int tpGrpSize; - - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // GatedAct parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - - // Pointer for partial row max for DeepSeek FP8 recipe. - // This is temporary storage for the row max results. - // If batchM, the shape is [2, totalNumPaddedTokens, N / 128] and the dtype is float. - // Otherwise, the shape is [2, totalNumPaddedTokens, M / 128] and the dtype is float. - float* ptrPartialRowMax{nullptr}; - - // Flags in global memory that sync on "exit" for row max computation. - // The shape is [numTilesM * numTilesN / 2] and the dtype is uint32_t, where - // if batchM, - // numTilesM = divUp(totalNumPaddedTokens, tileM). - // numTilesN = divUp(N, tileN). - // Otherwise, - // numTilesM = divUp(M, tileM). - // numTilesN = divUp(totalNumPaddedTokens, tileN). - // - // The memory must be set to 0 before the kernel launch. - uint32_t* ptrRowMaxCompletionBars{nullptr}; - - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // Member functions. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - enum class MatrixType + // Swap matrix C sizes if output is transposed. + if (matrixType == MatrixType::MatrixC && options.mTransposeMmaOutput) { - MatrixA = 0, - MatrixB, - MatrixC - }; - - // Create the TMA shape/stride for A/B/C. 
- template - static auto makeTmaShapeStrideAbc( - GemmOptions const& options, int mM, int mN, int mK, int tileM, int tileN, int tileK, MatrixType matrixType) - { - // Weights matrix is A if we transpose the output of MMA (to have it M-major). - // Otherwise, it is B, when the output of MMA is K-major. - bool const isWeights = (matrixType == MatrixType::MatrixA && options.mTransposeMmaOutput) - || (matrixType == MatrixType::MatrixB && !options.mTransposeMmaOutput); - - // The outer dimension. - auto numTokens = (matrixType == MatrixType::MatrixA || matrixType == MatrixType::MatrixC) ? mM : mN; - // The outer dimension tile size. - auto tileNumTokens = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileM - : (matrixType == MatrixType::MatrixA) ? tileM - : tileN; - // The inner dimension. - auto hiddenSize = (matrixType == MatrixType::MatrixC) ? mN : mK; - // The inner dimension tile size. - auto tileHiddenSize = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileN : tileK; - - // Swap matrix C sizes if output is transpose - if (matrixType == MatrixType::MatrixC && options.mTransposeMmaOutput) - { - numTokens = mN; - hiddenSize = mM; - tileNumTokens = options.mEpilogueTileN; - tileHiddenSize = options.mEpilogueTileM; - } - - // For a fused activation kernel, the hidden size of output is halved. TODO: That's true for - // gated activations but not regular activations. - if (options.mFusedAct) - { - if (matrixType == MatrixType::MatrixC) - { - hiddenSize /= 2; - tileHiddenSize /= 2; - } - } - - // The cute tensor shape for A/B: (numTokens, hiddenSize). - // Note that TMA descriptor expects the first dimension's stride to be - // 1, so swap the first two dimension so that the hiddenSize dimension comes first. - auto shape = std::vector{static_cast(hiddenSize), static_cast(numTokens)}; - // If the matrix is a weights matrix, we use 3D logical shape for it (B, M, K) or (B, N, K). - // Ativations matrix is 2D (sum(divUpMul(M[bi], tileM) for bi in B), K). 
- if (isWeights) - { - shape.push_back(static_cast(options.mNumBatches)); - } - - // Assemble the stride (strideTokens, 1). - // Swap the first two dimension as mentioned before. - auto stride = std::vector{1, static_cast(hiddenSize)}; - if (isWeights) - { - stride.push_back(static_cast(hiddenSize * numTokens)); - } - - // Assemble the box shape - std::vector tileShape = {tileHiddenSize, tileNumTokens}; - - // Alternate layouts do not apply to matrixC - if (matrixType != MatrixType::MatrixC) - { - gemm::MatrixLayout layout = (matrixType == MatrixType::MatrixA) ? options.mLayoutA : options.mLayoutB; - // Note, only the weights support non MajorK layouts - if (layout == gemm::MatrixLayout::MajorMn) - { - // Apply transpose if necessary - std::swap(shape[0], shape[1]); - stride[1] = numTokens; - std::swap(tileShape[0], tileShape[1]); - } - else if (layout == gemm::MatrixLayout::BlockMajorK) - { - // Set shapes based on blocking layout - shape = {static_cast(options.mBlockK), static_cast(numTokens), - static_cast(mK / options.mBlockK), static_cast(options.mNumBatches)}; - stride = {1, static_cast(options.mBlockK), static_cast(numTokens * options.mBlockK), - static_cast(hiddenSize * numTokens)}; - - // If blockK > tileK, then the inner most box size will be based on the tile - int32_t const tileBlockK = std::min(options.mBlockK, tileHiddenSize); - tileShape = {tileBlockK, tileNumTokens, tileHiddenSize / tileBlockK}; - } - } - - return std::make_tuple(shape, stride, tileShape); + std::swap(numTokens, hiddenSize); + std::swap(ctaTileNumTokens, ctaTileHiddenSize); + std::swap(tileNumTokens, tileHiddenSize); } - // Create the TMA shape/stride for A/B block scaling factors. - static auto makeTmaShapeStrideSfAb(int mM, int mN, int mK, MatrixType matrixType, int tileM, int tileN, int tileK, - tg::Dtype dtypeElt, tg::SfLayout layout, int sfReshapeFactor) + // For a fused activation kernel, the hidden size of output is halved. 
TODO: That's true for + // gated activations but not regular activations. + if (options.mFusedAct && matrixType == MatrixType::MatrixC) { - - // The outer dimension. - auto numTokens = matrixType == MatrixType::MatrixA ? mM : mN; - // The inner dimension. - auto hiddenSize = mK; - // The outer tile dimension. - auto numTokensPerTile = matrixType == MatrixType::MatrixA ? tileM : tileN; - // The inner tile dimension. - auto hiddenSizePerTile = tileK; - // Number of elements per scaling factor. - const int32_t numEltsPerSf = (dtypeElt == tg::Dtype::E2m1) ? 16 : 32; - - switch (layout) - { - case tg::SfLayout::R128c4: - { - // The scaling factor tensor packs 128x4 tiles into contiguous 512B blocks. - // The 512B block maps to a 32x16B (32x128b) block in TMEM. - // See https://nvbugspro.nvidia.com/bug/4165523 - // - // Additionally, we have to meet constraints of TMA that the box dimensions are less - // than 256 and boxDim[0] is a multiple of 16B. - // - // The "logical" tensor is: [outer, inner / numEltsPerSf] - // The aforementioned format is: [outer / 128, inner / numEltsPerSf / 4, 512] - // The shape we use for TMA is: [outer / 128, inner / numEltsPerSf / 4, 2, 256] - - auto shape = std::vector{256, 2, static_cast(ceilDiv(hiddenSize, numEltsPerSf * 4)), - static_cast(ceilDiv(numTokens, 128))}; - - std::vector stride(shape.size()); - stride[0] = 1; - for (size_t i = 1; i < shape.size(); i++) - { - stride[i] = shape[i - 1] * stride[i - 1]; - } - - auto tileShapes - = std::vector{256, 2, static_cast(ceilDiv(hiddenSizePerTile, numEltsPerSf * 4)), - static_cast(ceilDiv(numTokensPerTile, 128))}; - - return std::make_tuple(shape, stride, tileShapes); - } - - case tg::SfLayout::R8c4: - { - // The scaling factor tensor packs 8x4 tiles into contiguous 32B blocks. - // - // As the inner dimension (k) is often a multiple of the tile size, we can reshape to use - // fewer read requests, if the tile dimensions allow. It does not reduce the number of - // instructions. 
- // - // I.e., let's define r = min(⌈hiddenSizePerTile / (numEltsPerSf * 4)⌉, 8) - // - // The "logical" tensor is: [outer, inner / numEltsPerSf] - // The 8x4 SF layout is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf), 32] - // The TMA tensor shape is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf * r), r * 32] - // - // The caveat of NumRepeats>1 is we must pad the hidden dimension of SF to multiples of - // NumRepeats * numEltsPerSf * 4. - - // Detect if the supplied factor is power of 2. E.g., 0b0100 and (0b0100 - 1) == 0b0000. - int const r = sfReshapeFactor; - if (r > 0 && (r & (r - 1)) != 0) - { - throw std::runtime_error( - "mSfReshapeFactor must be positive and a power of 2. Found " + std::to_string(r)); - } - - // Sanitize number of repeats so it doesn't exceed the dimension. - int const repeats = std::min(ceilDiv(hiddenSizePerTile, numEltsPerSf * 4), r); - - // Detect if the input hidden size K is a multiple of the repeats. - if (ceilDiv(hiddenSize, numEltsPerSf * 4) % repeats != 0) - { - throw std::runtime_error("SF hiddenSize K (" + std::to_string(ceilDiv(hiddenSize, numEltsPerSf * 4)) - + ") must be a multiple of repeats (" + std::to_string(repeats) + ")"); - } - - auto shape = std::vector{static_cast(repeats * 32), - static_cast(ceilDiv(hiddenSize, numEltsPerSf * 4 * repeats)), - static_cast(ceilDiv(numTokens, 8))}; - - std::vector stride(shape.size()); - stride[0] = 1; - for (size_t i = 1; i < shape.size(); i++) - { - stride[i] = shape[i - 1] * stride[i - 1]; - } - - auto tileShapes = std::vector{static_cast(repeats * 32), - static_cast(ceilDiv(hiddenSizePerTile, numEltsPerSf * 4 * repeats)), - static_cast(ceilDiv(numTokensPerTile, 8))}; - - return std::make_tuple(shape, stride, tileShapes); - } - - default: throw std::runtime_error("Unsupported SF layout"); - } - return std::make_tuple(std::vector{}, std::vector{}, std::vector{}); + hiddenSize /= 2; + tileHiddenSize /= 2; + ctaTileHiddenSize /= 2; } - template - static KernelParams 
setKernelParams(GemmOptions_ const& options, bool const batchM, void const* ptrA, - void const* ptrB, void* ptrC, void const* dSfA, void const* dSfB, void const* ptrPerTokenSfA, - void const* ptrPerTokenSfB, void const* ptrBias, void* dSfC, float const* ptrScaleC, float const* ptrScaleGate, - float const* ptrSwiGluAlpha, float const* ptrSwiGluBeta, int32_t const* routeMap, float* rowMax, - uint32_t* rowMaxBars, int32_t const* ptrNumNonExitingCtas = nullptr, - int32_t const* ptrTotalNumPaddedTokens = nullptr, int32_t const* ptrCtaIdxXyToBatchIdx = nullptr, - int32_t const* ptrCtaIdxXyToMnLimit = nullptr, int32_t const maxNumCtas = MaxNumCtas) + // The cute tensor shape for A/B: (numTokens, hiddenSize). + // Note that TMA descriptor expects the first dimension's stride to be + // 1, so swap the first two dimension so that the hiddenSize dimension comes first. + + // Activations matrix is 2D (sum(divUpMul(M[bi], tileM) for bi in B), K). + std::vector shape = {static_cast(hiddenSize), static_cast(numTokens)}; + if (useTmaOobOpt /* also implies input/output activation */) { + // If TMA OOB optimization is used, we use 3D logical shape (M, tileM, K) or (N, tileN, K). + // The outer dimension is extended to make room for the possible counterbalance positive + // offset from the middle "bound" dimension. The counterbalance should be no more than + // ctaTileNumTokens. + shape = {static_cast(hiddenSize), static_cast(ctaTileNumTokens), + static_cast(numTokens + ctaTileNumTokens)}; + } + else if (isWeights) + { + // If the matrix is a weights matrix, we use 3D logical shape (B, M, K) or (B, N, K). + shape = {static_cast(hiddenSize), static_cast(numTokens), + static_cast(options.mNumBatches)}; + } - static_assert(sizeof(KernelParams) <= 32 * 1024, "sizeof(KernelParams) has to be less or equal than 32KB"); + // Assemble the stride (strideTokens, 1). + // Swap the first two dimension as mentioned before. 
+ std::vector stride = {1, static_cast(hiddenSize)}; + if (useTmaOobOpt) + { + stride = {1, static_cast(hiddenSize), static_cast(hiddenSize)}; + } + else if (isWeights) + { + stride = { + 1, static_cast(hiddenSize), static_cast(hiddenSize) * static_cast(numTokens)}; + } - // Create the return struct. - KernelParams params; + // Assemble the box shape + std::vector tileShape = {tileHiddenSize, tileNumTokens}; - params.ptrRouteMap = routeMap; - params.numTokens = options.mNumTokens; - - params.ptrScaleC = ptrScaleC; - params.ptrScaleGate = ptrScaleGate; - - params.ptrSwiGluAlpha = ptrSwiGluAlpha; - params.ptrSwiGluBeta = ptrSwiGluBeta; - - int32_t ctaOffset = 0; - - // Compute totalNumPaddedTokens, ctaIdxXyToBatchIdx and ctaIdxXyToMnLimit if the batch dims are - // known at kernel launch time. Otherwise, these parameters are defined in the device buffers: - // ptrTotalNumPaddedTokens, ptrCtaIdxXyToBatchIdx and ptrCtaIdxXyToMnLimit respectively. - - if (options.mIsStaticBatch) + // Alternate layouts (MajorMn and BlockMajorK) do not apply to matrixC + if (matrixType != MatrixType::MatrixC) + { + gemm::MatrixLayout layout = (matrixType == MatrixType::MatrixA) ? 
options.mLayoutA : options.mLayoutB; + // Note, only the weights support non MajorK layouts + if (layout == gemm::MatrixLayout::MajorMn) { - params.totalNumPaddedTokens = 0; - for (int b = 0; b < options.mNumBatches; b++) + // Apply transpose if necessary + std::swap(shape[0], shape[1]); + stride[1] = numTokens; + std::swap(tileShape[0], tileShape[1]); + } + else if (layout == gemm::MatrixLayout::BlockMajorK) + { + // Set shapes based on blocking layout + shape = {static_cast(options.mBlockK), static_cast(numTokens), + static_cast(mK / options.mBlockK), static_cast(options.mNumBatches)}; + stride = {1, static_cast(options.mBlockK), static_cast(numTokens * options.mBlockK), + static_cast(hiddenSize * numTokens)}; + + // If blockK > tileK, then the inner most box size will be based on the tile + int32_t const tileBlockK = std::min(options.mBlockK, tileHiddenSize); + tileShape = {tileBlockK, tileNumTokens, tileHiddenSize / tileBlockK}; + } + } + + return std::make_tuple(shape, stride, tileShape); +} + +// Create the TMA shape/stride for A/B block scaling factors. +static auto makeTmaShapeStrideSfAb(int mM, int mN, int mK, MatrixType matrixType, int tileM, int tileN, int tileK, + tg::Dtype dtypeElt, tg::SfLayout layout, int sfReshapeFactor) +{ + + // The outer dimension. + auto numTokens = matrixType == MatrixType::MatrixA ? mM : mN; + // The inner dimension. + auto hiddenSize = mK; + // The outer tile dimension. + auto numTokensPerTile = matrixType == MatrixType::MatrixA ? tileM : tileN; + // The inner tile dimension. + auto hiddenSizePerTile = tileK; + // Number of elements per scaling factor. + const int32_t numEltsPerSf = (dtypeElt == tg::Dtype::E2m1) ? 16 : 32; + + switch (layout) + { + case tg::SfLayout::R128c4: + { + // The scaling factor tensor packs 128x4 tiles into contiguous 512B blocks. + // The 512B block maps to a 32x16B (32x128b) block in TMEM. 
+ // See https://nvbugspro.nvidia.com/bug/4165523 + // + // Additionally, we have to meet constraints of TMA that the box dimensions are less + // than 256 and boxDim[0] is a multiple of 16B. + // + // The "logical" tensor is: [outer, inner / numEltsPerSf] + // The aforementioned format is: [outer / 128, inner / numEltsPerSf / 4, 512] + // The shape we use for TMA is: [outer / 128, inner / numEltsPerSf / 4, 2, 256] + + auto shape = std::vector{256, 2, static_cast(ceilDiv(hiddenSize, numEltsPerSf * 4)), + static_cast(ceilDiv(numTokens, 128))}; + + std::vector stride(shape.size()); + stride[0] = 1; + for (size_t i = 1; i < shape.size(); i++) + { + stride[i] = shape[i - 1] * stride[i - 1]; + } + + auto tileShapes + = std::vector{256, 2, static_cast(ceilDiv(hiddenSizePerTile, numEltsPerSf * 4)), + static_cast(ceilDiv(numTokensPerTile, 128))}; + + return std::make_tuple(shape, stride, tileShapes); + } + + case tg::SfLayout::R8c4: + { + // The scaling factor tensor packs 8x4 tiles into contiguous 32B blocks. + // + // As the inner dimension (k) is often a multiple of the tile size, we can reshape to use + // fewer read requests, if the tile dimensions allow. It does not reduce the number of + // instructions. + // + // I.e., let's define r = min(⌈hiddenSizePerTile / (numEltsPerSf * 4)⌉, 8) + // + // The "logical" tensor is: [outer, inner / numEltsPerSf] + // The 8x4 SF layout is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf), 32] + // The TMA tensor shape is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf * r), r * 32] + // + // The caveat of NumRepeats>1 is we must pad the hidden dimension of SF to multiples of + // NumRepeats * numEltsPerSf * 4. + + // Detect if the supplied factor is power of 2. E.g., 0b0100 and (0b0100 - 1) == 0b0000. + int const r = sfReshapeFactor; + if (r > 0 && (r & (r - 1)) != 0) + { + throw std::runtime_error("mSfReshapeFactor must be positive and a power of 2. 
Found " + std::to_string(r)); + } + + // Sanitize number of repeats so it doesn't exceed the dimension. + int const repeats = std::min(ceilDiv(hiddenSizePerTile, numEltsPerSf * 4), r); + + // Detect if the input hidden size K is a multiple of the repeats. + if (ceilDiv(hiddenSize, numEltsPerSf * 4) % repeats != 0) + { + throw std::runtime_error("SF hiddenSize K (" + std::to_string(ceilDiv(hiddenSize, numEltsPerSf * 4)) + + ") must be a multiple of repeats (" + std::to_string(repeats) + ")"); + } + + auto shape = std::vector{static_cast(repeats * 32), + static_cast(ceilDiv(hiddenSize, numEltsPerSf * 4 * repeats)), + static_cast(ceilDiv(numTokens, 8))}; + + std::vector stride(shape.size()); + stride[0] = 1; + for (size_t i = 1; i < shape.size(); i++) + { + stride[i] = shape[i - 1] * stride[i - 1]; + } + + auto tileShapes = std::vector{static_cast(repeats * 32), + static_cast(ceilDiv(hiddenSizePerTile, numEltsPerSf * 4 * repeats)), + static_cast(ceilDiv(numTokensPerTile, 8))}; + + return std::make_tuple(shape, stride, tileShapes); + } + + default: throw std::runtime_error("Unsupported SF layout"); + } + return std::make_tuple(std::vector{}, std::vector{}, std::vector{}); +} + +template +static KernelParams setKernelParams(GemmOptions_ const& options, bool const batchM, void const* ptrA, void const* ptrB, + void* ptrC, void const* dSfA, void const* dSfB, void const* ptrPerTokenSfA, void const* ptrPerTokenSfB, + void const* ptrBias, void* dSfC, float const* ptrScaleC, float const* ptrScaleGate, float const* ptrClampLimit, + float const* ptrSwiGluAlpha, float const* ptrSwiGluBeta, int32_t const* routeMap, float* rowMax, + uint32_t* rowMaxBars, int32_t const* ptrNumNonExitingCtas = nullptr, + int32_t const* ptrTotalNumPaddedTokens = nullptr, int32_t const* ptrCtaIdxXyToBatchIdx = nullptr, + int32_t const* ptrCtaIdxXyToMnLimit = nullptr, int32_t const maxNumCtas = KernelParams::MaxNumCtas) +{ + + static_assert(sizeof(KernelParams) <= 32 * 1024, "sizeof(KernelParams) has to 
be less or equal than 32KB"); + + // Create the return struct. + KernelParams params; + + params.ptrRouteMap = routeMap; + params.numTokens = options.mNumTokens; + + params.ptrScaleC = ptrScaleC; + params.ptrScaleGate = ptrScaleGate; + params.ptrClampLimit = ptrClampLimit; + params.ptrSwiGluAlpha = ptrSwiGluAlpha; + params.ptrSwiGluBeta = ptrSwiGluBeta; + + int32_t ctaOffset = 0; + + // Compute totalNumPaddedTokens, ctaIdxXyToBatchIdx and ctaIdxXyToMnLimit if the batch dims are + // known at kernel launch time. Otherwise, these parameters are defined in the device buffers: + // ptrTotalNumPaddedTokens, ptrCtaIdxXyToBatchIdx and ptrCtaIdxXyToMnLimit respectively. + + if (options.mIsStaticBatch) + { + params.totalNumPaddedTokens = 0; + for (int b = 0; b < options.mNumBatches; b++) + { + + int mM = batchM ? options.mBatchedM[b] : options.mM; + int mN = batchM ? options.mN : options.mBatchedN[b]; + + // Skip Tma descriptor creation if expert isn't used + if (mM == 0 || mN == 0) + { + continue; + } + + // The number of CTAs. + int32_t numCtas + = batchM ? (mM + options.mTileM - 1) / options.mTileM : (mN + options.mTileN - 1) / options.mTileN; + // The size of the tile. + int32_t tile = batchM ? options.mTileM : options.mTileN; + // The problem size. + int32_t mn = batchM ? mM : mN; + int32_t tokensPerTile = mn; + + // Make sure we do not exceed the launch limit. + if (ctaOffset + numCtas > KernelParams::MaxNumCtas) + { + throw std::runtime_error("Too many CTAs"); + } + + for (int32_t cta = 0; cta < numCtas; cta++) + { + params.ctaIdxXyToBatchIdx[ctaOffset + cta] = b; + // This is now an identity map and it is no longer needed. 
+ // params.ctaIdxXyToTileIdxMn[ctaOffset + cta] = ctaOffset + cta; + params.ctaIdxXyToMnLimit[ctaOffset + cta] + = std::min((ctaOffset + cta + 1) * tile, ctaOffset * tile + tokensPerTile); + } + ctaOffset += numCtas; + + params.totalNumPaddedTokens += numCtas * tile; + } + } + else + { + params.ptrTotalNumPaddedTokens = ptrTotalNumPaddedTokens; + params.ptrCtaIdxXyToBatchIdx = ptrCtaIdxXyToBatchIdx; + params.ptrCtaIdxXyToMnLimit = ptrCtaIdxXyToMnLimit; + ctaOffset = maxNumCtas; + } + + if (options.mUseDeepSeekFp8 && options.mDtypeC == tg::Dtype::E4m3) + { + params.ptrDqSfsC = reinterpret_cast(dSfC); + } + + params.ptrA = ptrA; + params.ptrB = ptrB; + params.strideInBytesA = options.mK * tg::dtypeGetNumBits(options.mDtypeA) / 8; + params.strideInBytesB = options.mK * tg::dtypeGetNumBits(options.mDtypeB) / 8; + + params.ptrSfA = dSfA; + params.ptrSfB = dSfB; + params.ptrSfC = dSfC; + + if (!batchM) + { + // A is the expert + if (0 != options.mM % options.mTileM) + { + throw std::runtime_error("0 == mM %% tileM"); + } + params.tileStridePerBatch = options.mM / options.mTileM; + params.nm = options.mM; + // Shape/stride for gmem tensor A. + auto [shapeA, strideA, tileShapeA] = makeTmaShapeStrideAbc(options, options.mM, options.mN, options.mK, + options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixA); + // Build tma descriptor for A. + params.tmaA[0] = gemm::buildNdTmaDescriptor( + options.mDtypeA, options.mMmaKind, shapeA, strideA, tileShapeA, const_cast(ptrA)); + + // The input is padded: + // [act0, padding, padding, ... TileN size .., act1, padding, padding, ...] + auto const inputNumTokens = ctaOffset * options.mTileN; + + if (!batchedGemm::doesRouteImplUseLdgsts(options.mRouteImpl)) + { + bool useRouteAct = batchedGemm::doesRouteImplUseTma(options.mRouteImpl); + // B is the activation + // Shape/stride for gmem tensor B. + auto [shapeB, strideB, tileShapeB] = makeTmaShapeStrideAbc(options, options.mM, + useRouteAct ? 
options.mNumTokens : inputNumTokens, options.mK, options.mTileM, + (useRouteAct ? 1 : options.mTileN), options.mTileK, MatrixType::MatrixB); + // Build tma descriptor for B. + params.tmaB[0] = gemm::buildNdTmaDescriptor( + options.mDtypeB, options.mMmaKind, shapeB, strideB, tileShapeB, const_cast(ptrB)); + } + + if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE4m3 + || options.mDtypeA == tg::Dtype::MxE2m1) + { + tg::Dtype const dTypeSf = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; + + // Build TMA descriptor for gmem A block scaling factors. + auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb(options.mM * options.mNumBatches, + options.mN, options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK, + options.mDtypeA, tg::SfLayout::R128c4, options.mSfReshapeFactor); + params.tmaSfA[0] + = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast(dSfA)); + } + + if (options.mDtypeB == tg::Dtype::E2m1 || options.mDtypeB == tg::Dtype::MxE4m3 + || options.mDtypeB == tg::Dtype::MxE2m1) + { + tg::Dtype const dTypeSf = (options.mDtypeB == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; + + if (batchedGemm::doesRouteImplUseTma(options.mRouteImpl)) { - int mM = batchM ? options.mBatchedM[b] : options.mN; - int mN = batchM ? options.mM : options.mBatchedN[b]; + // The input is NOT padded: + // [act0, act1, act2, ...] - // Skip Tma descriptor creation if expert isn't used - if (mM == 0 || mN == 0) - { - continue; - } + // Build TMA descriptor for gmem B block scaling factors. + int32_t const numEltsPerSf = tg::dtypeNumEltsPerSf(options.mDtypeB); + // Pad number of scaling factors to the nearest multiple of 16 because of the TMA 16B + // alignment requirement. + auto numSfsInK = options.mK / numEltsPerSf; + numSfsInK = ceilDiv(numSfsInK, 16) * 16; - // The number of CTAs. - int32_t numCtas - = batchM ? 
(mM + options.mTileM - 1) / options.mTileM : (mN + options.mTileN - 1) / options.mTileN; - // The size of the tile. - int32_t tile = batchM ? options.mTileM : options.mTileN; - // The problem size. - int32_t mn = batchM ? mM : mN; - int32_t tokensPerTile = mn; - - // Make sure we do not exceed the launch limit. - if (ctaOffset + numCtas > MaxNumCtas) - { - throw std::runtime_error("Too many CTAs"); - } - - for (int32_t cta = 0; cta < numCtas; cta++) - { - params.ctaIdxXyToBatchIdx[ctaOffset + cta] = b; - // This is now an identity map and it is no longer needed. - // params.ctaIdxXyToTileIdxMn[ctaOffset + cta] = ctaOffset + cta; - params.ctaIdxXyToMnLimit[ctaOffset + cta] - = std::min((ctaOffset + cta + 1) * tile, ctaOffset * tile + tokensPerTile); - } - ctaOffset += numCtas; - - params.totalNumPaddedTokens += numCtas * tile; + auto [shapeSfB, strideSfB, tileShapesSfB] + = makeTmaShapeStrideAbc(options, options.mM, options.mNumTokens, numSfsInK, options.mTileM, + 1 /* tileN */, options.mTileK / numEltsPerSf, MatrixType::MatrixB); + params.tmaSfB[0] = gemm::buildNdTmaDescriptor(dTypeSf, options.mMmaKind, shapeSfB, strideSfB, + tileShapesSfB, const_cast(dSfB), + /*doSwizzle*/ true); } + else if (batchedGemm::doesRouteImplUseNoRoute(options.mRouteImpl)) + { + + // The input is padded: + // [act0, padding, padding, ... TileN size .., act1, padding, padding, ...] + + auto const inputNumTokensSfB = ctaOffset * options.mTileN; + + // Build TMA descriptor for gmem B block scaling factors. + auto [shapeSfB, strideSfB, tileShapesSfB] = makeTmaShapeStrideSfAb(options.mM, inputNumTokensSfB, + options.mK, MatrixType::MatrixB, options.mTileM, options.mTileN, options.mTileK, options.mDtypeB, + options.mSfLayoutB, options.mSfReshapeFactor); + params.tmaSfB[0] + = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfB, strideSfB, tileShapesSfB, const_cast(dSfB)); + } + } + + // C is the output activation + if (options.mUseTmaStore) + { + // Shape/stride for gmem tensor C. 
+ auto [shapeC, strideC, tileShapeC] = makeTmaShapeStrideAbc(options, options.mM, ctaOffset * options.mTileN, + options.mK, options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixC); + // Build tma descriptor for C. + params.tmaC[0] + = gemm::buildNdTmaDescriptor(options.mDtypeC, tg::MmaKind::Auto, shapeC, strideC, tileShapeC, ptrC); } else { - params.ptrTotalNumPaddedTokens = ptrTotalNumPaddedTokens; - params.ptrCtaIdxXyToBatchIdx = ptrCtaIdxXyToBatchIdx; - params.ptrCtaIdxXyToMnLimit = ptrCtaIdxXyToMnLimit; - ctaOffset = maxNumCtas; + params.ptrC = ptrC; } - - if (options.mUseDeepSeekFp8 && options.mDtypeC == tg::Dtype::E4m3) + } + else + { + // B is the expert + if (0 != options.mN % options.mTileN) { - params.ptrDqSfsC = reinterpret_cast(dSfC); + throw std::runtime_error("0 == mN %% tileN"); } + params.tileStridePerBatch = options.mN / options.mTileN; + params.nm = options.mN; + // Shape/stride for gmem tensor B. + auto [shapeB, strideB, tileShapeB] = makeTmaShapeStrideAbc(options, options.mM, options.mN, options.mK, + options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixB); + // Build tma descriptor for B. + params.tmaB[0] = gemm::buildNdTmaDescriptor( + options.mDtypeB, options.mMmaKind, shapeB, strideB, tileShapeB, const_cast(ptrB)); - params.ptrA = ptrA; - params.ptrB = ptrB; - params.strideInBytesA = options.mK * tg::dtypeGetNumBits(options.mDtypeA) / 8; - params.strideInBytesB = options.mK * tg::dtypeGetNumBits(options.mDtypeB) / 8; - - params.ptrSfA = dSfA; - params.ptrSfB = dSfB; - params.ptrSfC = dSfC; - - if (!batchM) + if (options.mRouteImpl == batchedGemm::RouteImpl::NoRoute) { - // A is the expert - if (0 != options.mM % options.mTileM) - { - throw std::runtime_error("0 == mM %% tileM"); - } - params.tileStridePerBatch = options.mM / options.mTileM; - params.nm = options.mM; + // A is the activation // Shape/stride for gmem tensor A. 
- auto [shapeA, strideA, tileShapeA] = makeTmaShapeStrideAbc(options, options.mM, options.mN, options.mK, + // The input is padded: + // [act0, padding, padding, ... tileM size .., act1, padding, padding, ...] + auto const inputNumTokens = ctaOffset * options.mTileM; + auto [shapeA, strideA, tileShapeA] = makeTmaShapeStrideAbc(options, inputNumTokens, options.mN, options.mK, options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixA); // Build tma descriptor for A. params.tmaA[0] = gemm::buildNdTmaDescriptor( options.mDtypeA, options.mMmaKind, shapeA, strideA, tileShapeA, const_cast(ptrA)); - - // The input is padded: - // [act0, padding, padding, ... TileN size .., act1, padding, padding, ...] - auto const inputNumTokens = ctaOffset * options.mTileN; - - if (!batchedGemm::doesRouteImplUseLdgsts(options.mRouteImpl)) - { - bool useRouteAct = batchedGemm::doesRouteImplUseTma(options.mRouteImpl); - // B is the activation - // Shape/stride for gmem tensor B. - auto [shapeB, strideB, tileShapeB] = makeTmaShapeStrideAbc(options, options.mM, - useRouteAct ? options.mNumTokens : inputNumTokens, options.mK, options.mTileM, - (useRouteAct ? 1 : options.mTileN), options.mTileK, MatrixType::MatrixB); - // Build tma descriptor for B. - params.tmaB[0] = gemm::buildNdTmaDescriptor( - options.mDtypeB, options.mMmaKind, shapeB, strideB, tileShapeB, const_cast(ptrB)); - } - - if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE4m3 - || options.mDtypeA == tg::Dtype::MxE2m1) - { - tg::Dtype const dTypeSf = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; - - // Build TMA descriptor for gmem A block scaling factors. 
- auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb(options.mM * options.mNumBatches, - options.mN, options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK, - options.mDtypeA, tg::SfLayout::R128c4, options.mSfReshapeFactor); - params.tmaSfA[0] - = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast(dSfA)); - } - - if (options.mDtypeB == tg::Dtype::E2m1 || options.mDtypeB == tg::Dtype::MxE4m3 - || options.mDtypeB == tg::Dtype::MxE2m1) - { - tg::Dtype const dTypeSf = (options.mDtypeB == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; - - if (batchedGemm::doesRouteImplUseTma(options.mRouteImpl)) - { - - // The input is NOT padded: - // [act0, act1, act2, ...] - - // Build TMA descriptor for gmem B block scaling factors. - int32_t const numEltsPerSf = tg::dtypeNumEltsPerSf(options.mDtypeB); - // Pad number of scaling factors to the nearest multiple of 16 because of the TMA 16B - // alignment requirement. - auto numSfsInK = options.mK / numEltsPerSf; - numSfsInK = ceilDiv(numSfsInK, 16) * 16; - - auto [shapeSfB, strideSfB, tileShapesSfB] - = makeTmaShapeStrideAbc(options, options.mM, options.mNumTokens, numSfsInK, options.mTileM, - 1 /* tileN */, options.mTileK / numEltsPerSf, MatrixType::MatrixB); - params.tmaSfB[0] = gemm::buildNdTmaDescriptor(dTypeSf, options.mMmaKind, shapeSfB, strideSfB, - tileShapesSfB, const_cast(dSfB), - /*doSwizzle*/ true); - } - else if (batchedGemm::doesRouteImplUseNoRoute(options.mRouteImpl)) - { - - // The input is padded: - // [act0, padding, padding, ... TileN size .., act1, padding, padding, ...] - - auto const inputNumTokensSfB = ctaOffset * options.mTileN; - - // Build TMA descriptor for gmem B block scaling factors. 
- auto [shapeSfB, strideSfB, tileShapesSfB] = makeTmaShapeStrideSfAb(options.mM, inputNumTokensSfB, - options.mK, MatrixType::MatrixB, options.mTileM, options.mTileN, options.mTileK, - options.mDtypeB, options.mSfLayoutB, options.mSfReshapeFactor); - params.tmaSfB[0] = gemm::buildSfTmaDescriptor( - dTypeSf, shapeSfB, strideSfB, tileShapesSfB, const_cast(dSfB)); - } - } - - // C is the output activation - if (options.mUseTmaStore) - { - // Shape/stride for gmem tensor C. - auto [shapeC, strideC, tileShapeC] - = makeTmaShapeStrideAbc(options, options.mM, ctaOffset * options.mTileN, options.mK, options.mTileM, - options.mTileN, options.mTileK, MatrixType::MatrixC); - // Build tma descriptor for C. - params.tmaC[0] - = gemm::buildNdTmaDescriptor(options.mDtypeC, tg::MmaKind::Auto, shapeC, strideC, tileShapeC, ptrC); - } - else - { - params.ptrC = ptrC; - } } - else + + if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE4m3 + || options.mDtypeA == tg::Dtype::MxE2m1) { - // B is the expert - if (0 != options.mN % options.mTileN) - { - throw std::runtime_error("0 == mN %% tileN"); - } - params.tileStridePerBatch = options.mN / options.mTileN; - params.nm = options.mN; - // Shape/stride for gmem tensor B. - auto [shapeB, strideB, tileShapeB] = makeTmaShapeStrideAbc(options, options.mM, options.mN, options.mK, - options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixB); - // Build tma descriptor for B. - params.tmaB[0] = gemm::buildNdTmaDescriptor( - options.mDtypeB, options.mMmaKind, shapeB, strideB, tileShapeB, const_cast(ptrB)); + tg::Dtype const dTypeSf = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; if (options.mRouteImpl == batchedGemm::RouteImpl::NoRoute) { - // A is the activation - // Shape/stride for gmem tensor A. + // The input is padded: // [act0, padding, padding, ... tileM size .., act1, padding, padding, ...] 
- auto const inputNumTokens = ctaOffset * options.mTileM; - auto [shapeA, strideA, tileShapeA] = makeTmaShapeStrideAbc(options, inputNumTokens, options.mN, - options.mK, options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixA); - // Build tma descriptor for A. - params.tmaA[0] = gemm::buildNdTmaDescriptor( - options.mDtypeA, options.mMmaKind, shapeA, strideA, tileShapeA, const_cast(ptrA)); - } + auto const inputNumTokensSfA = ctaOffset * options.mTileM; - if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE4m3 - || options.mDtypeA == tg::Dtype::MxE2m1) - { - tg::Dtype const dTypeSf = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; - - if (options.mRouteImpl == batchedGemm::RouteImpl::NoRoute) - { - - // The input is padded: - // [act0, padding, padding, ... tileM size .., act1, padding, padding, ...] - auto const inputNumTokensSfA = ctaOffset * options.mTileM; - - // Build TMA descriptor for gmem A block scaling factors. - auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb(inputNumTokensSfA, options.mN, - options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK, - options.mDtypeA, tg::SfLayout::R128c4, options.mSfReshapeFactor); - params.tmaSfA[0] = gemm::buildSfTmaDescriptor( - dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast(dSfA)); - } - } - - if (options.mDtypeB == tg::Dtype::E2m1 || options.mDtypeB == tg::Dtype::MxE4m3 - || options.mDtypeB == tg::Dtype::MxE2m1) - { - tg::Dtype const dTypeSf = (options.mDtypeB == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; - - // Build TMA descriptor for gmem B block scaling factors. 
- auto [shapeSfB, strideSfB, tileShapesSfB] = makeTmaShapeStrideSfAb(options.mM, - options.mN * options.mNumBatches, options.mK, MatrixType::MatrixB, options.mTileM, options.mTileN, - options.mTileK, options.mDtypeB, options.mSfLayoutB, options.mSfReshapeFactor); - params.tmaSfB[0] - = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfB, strideSfB, tileShapesSfB, const_cast(dSfB)); - } - - // C is the output activation - if (options.mUseTmaStore) - { - // Shape/stride for gmem tensor C. - auto [shapeC, strideC, tileShapeC] = makeTmaShapeStrideAbc(options, ctaOffset * options.mTileM, - options.mN, options.mK, options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixC); - // Build tma descriptor for C. - params.tmaC[0] - = gemm::buildNdTmaDescriptor(options.mDtypeC, tg::MmaKind::Auto, shapeC, strideC, tileShapeC, ptrC); - } - else - { - params.ptrC = ptrC; + // Build TMA descriptor for gmem A block scaling factors. + auto [shapeSfA, strideSfA, tileShapesSfA] = makeTmaShapeStrideSfAb(inputNumTokensSfA, options.mN, + options.mK, MatrixType::MatrixA, options.mTileM, options.mTileN, options.mTileK, options.mDtypeA, + tg::SfLayout::R128c4, options.mSfReshapeFactor); + params.tmaSfA[0] + = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast(dSfA)); } } - params.k = options.mK; - params.numBatches = options.mNumBatches; + if (options.mDtypeB == tg::Dtype::E2m1 || options.mDtypeB == tg::Dtype::MxE4m3 + || options.mDtypeB == tg::Dtype::MxE2m1) + { + tg::Dtype const dTypeSf = (options.mDtypeB == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; - params.rank = 0; - params.tpGrpSize = 1; + // Build TMA descriptor for gmem B block scaling factors. 
+ auto [shapeSfB, strideSfB, tileShapesSfB] = makeTmaShapeStrideSfAb(options.mM, + options.mN * options.mNumBatches, options.mK, MatrixType::MatrixB, options.mTileM, options.mTileN, + options.mTileK, options.mDtypeB, options.mSfLayoutB, options.mSfReshapeFactor); + params.tmaSfB[0] + = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfB, strideSfB, tileShapesSfB, const_cast(dSfB)); + } - params.ptrPartialRowMax = rowMax; - params.ptrRowMaxCompletionBars = rowMaxBars; - - params.ptrNumNonExitingCtas = ptrNumNonExitingCtas; - - // Set the per-token scale factors for MetaFP8 or scale inputs - params.ptrPerTokenSfA = ptrPerTokenSfA; - params.ptrPerTokenSfB = ptrPerTokenSfB; - params.ptrBias = ptrBias; - - return params; + // C is the output activation + if (options.mUseTmaStore) + { + // Shape/stride for gmem tensor C. + auto [shapeC, strideC, tileShapeC] = makeTmaShapeStrideAbc(options, ctaOffset * options.mTileM, options.mN, + options.mK, options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixC); + // Build tma descriptor for C. 
+ params.tmaC[0] + = gemm::buildNdTmaDescriptor(options.mDtypeC, tg::MmaKind::Auto, shapeC, strideC, tileShapeC, ptrC); + } + else + { + params.ptrC = ptrC; + } } + + params.k = options.mK; + params.numBatches = options.mNumBatches; + + params.rank = 0; + params.tpGrpSize = 1; + + params.ptrPartialRowMax = rowMax; + params.ptrRowMaxCompletionBars = rowMaxBars; + + params.ptrNumNonExitingCtas = ptrNumNonExitingCtas; + + // Set the per-token scale factors for MetaFP8 or scale inputs + params.ptrPerTokenSfA = ptrPerTokenSfA; + params.ptrPerTokenSfB = ptrPerTokenSfB; + params.ptrBias = ptrBias; + + return params; +} #endif -}; +}; // namespace KernelParamsSetup //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace batchedGemm +} // namespace batchedGemm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h new file mode 100644 index 0000000000..2dfb0a1894 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h @@ -0,0 +1,547 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & + * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +namespace batchedGemm +{ + +// This is device code + +struct KernelParams +{ + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // BatchedGemm parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // Maximum number of CTAs + static constexpr int MaxNumCtas = 2048; + + // NOTE: TMA out-of-bounds optimization for MoE padded tokens: + // + // Originally the padded tokens is a 2D tensor [hiddenDim, ctaGridDimY * tileN] with stride [1, + // hiddenDim] and box size [tileM, tileN] at pointer p. We waste bandwidth bytes since we only + // want to load [0, batchEnd) out of the [0, tileN) box size: batchEnd is a runtime variable while + // box size needs to be fixed at compile time. + // + // To deal with this, we reshape the tensor to 3D: [hiddenDim, tileN, ctaGridDimY * tileN] with + // stride [1, hiddenDim, hiddenDim] and box size [tileM, tileN, 1]. For the original 2D + // tensor, + // + // Offset Coords [ : , ctaIdxY * tileN ], + // Box Sizes [ : , tileN ], + // Coords Range [ : , ctaIdxY * tileN : ctaIdxY * tileN + tileN], + // + // while we only want load the range [ctaIdxY * tileN, ctaIdxY * tileN + batchEnd), 1 <= batchEnd + // <= tileN + // + // For the reshaped 3D tensor, + // + // Offset Coords [ : , tileN - batchEnd , + // ctaIdxY * tileN + batchEnd ], + // Box Sizes [ : , tileN , + // 1 ], + // Coords Range [ : , tileN - batchEnd : min(tileN, 2 * tileN - batchEnd), + // ctaIdxY * tileN + batchEnd : ctaIdx * tileN + batchEnd + 1], + // + // while min(tileN, 2 * tileN - batchEnd) always evaluates to tileN. The unwanted tokens are + // essentially filtered out by utilizing the OOB feature of TMA. 
Since the 2nd and 3rd dimension + // has the same stride, we end up loading the following (adding the left and right end of the 2nd + // and 3rd dimension ranges): + // + // Effective 2D Coords Range + // [ : , tileN + ctaIdxY * tileN : tileN + ctaIdxY * tileN + batchEnd], + // + // This is exactly the same as the original range except for the offset tileN, thus we also need + // to offset the pointer in the opposite direction: + // + // Ptr (p) -> Ptr (p - tileN * hiddenDim) + // + // Due to the restrictions of TMA unit, the above operations requires the TMA descriptor and the + // underlying buffer be constructed differently: + // - Requires valid buffer at (p - tileN * hidden) - needs prepending `tileN` tokens. + // - TMA outermost dimension must be extended by `tileN` or loads will OOB in the rightmost side. + // The latter is because when batchEnd == tileN, the offset coords in the 3rd dimension becomes + // ctaIdxY * tileN + tileN. When ctaIdxY = ctaGridDimY - 1, it becomes ((ctaGridDimY - 1) * tileN + // + tileN = ctaGridDimY * tileN which is equal to the 3rd dimension size and will be filtered + // out. That's why we need to extend the tensor size by tileN. + // + // TMA descriptor for A. + // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from + // makeTmaShapeStrideAbc. + // + // If batchM: + // Logical shape is [sum(divUpMul(M[bi], tileM) for bi in B), K]. + // Logical strides are [K, 1]. + // Tile box shape is [tileM, tileK]. + // Tile box strides are [tileK, 1]. + // + // If batchN: + // If layoutA is MatrixLayout::MajorK + // Logical shape is [B, divUpMul(M, tileM), K]. + // Logical strides are [divUpMul(M, tileM) * K, K, 1]. + // Tile box shape is [1, tileM, tileK]. + // Tile box strides are [0, tileK, 1]. + // If layoutA is MatrixLayout::Mn + // Logical shape is [B, K, divUpMul(M, tileM)]. + // Logical strides are [K * divUpMul(M, tileM), divUpMul(M, tileM), 1]. + // Tile box shape is [1, tileK, tileM]. 
+ // Tile box strides are [0, tileM, 1]. + // If layoutA is MatrixLayout::BlockMajorK + // Logical shape is [B, K / blockK, divUpMul(M, tileM), blockK]. + // Logical strides are [K * divUpMul(M, tileM), divUpMul(M, tileM) * blockK, blockK, 1]. + // Tile box shape is [1, tileK / min(blockK, tileK), tileM, min(blockK, tileK)]. + // Tile box strides are [0, tileM * min(blockK, tileK), min(blockK, tileK), 1]. + // where blockK is 128B. + // + // Dtype is set from options.mDtypeA. + CUtensorMap tmaA[1]; + + // TMA descriptor for B. + // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from + // makeTmaShapeStrideAbc. + // + // If batchM: + // If layoutB is MatrixLayout::MajorK + // Logical shape is [B, divUpMul(N, tileN), K]. + // Logical strides are [divUpMul(N, tileN) * K, K, 1]. + // Tile box shape is [1, tileN, tileK]. + // Tile box strides are [0, tileK, 1]. + // If layoutB is MatrixLayout::MajorMn + // Logical shape is [B, K, divUpMul(N, tileN)]. + // Logical strides are [K * divUpMul(N, tileN), divUpMul(N, tileN), 1]. + // Tile box shape is [1, tileK, tileN]. + // Tile box strides are [0, tileN, 1]. + // If layoutB is MatrixLayout::BlockMajorK + // Logical shape is [B, K / blockK, divUpMul(N, tileN), blockK]. + // Logical strides are [K * divUpMul(N, tileN), divUpMul(N, tileN) * blockK, blockK, 1]. + // Tile box shape is [1, tileK / min(blockK, tileK), tileN, min(blockK, tileK)]. + // Tile box strides are [0, tileN * min(blockK, tileK), min(blockK, tileK), 1]. + // where blockK is 128B. + // + // If batchN: + // Logical shape is [sum(divUpMul(N[bi], tileN) for bi in B), K]. + // Logical strides are [K, 1]. + // Tile box shape is [tileN, tileK]. + // Tile box strides are [tileK, 1]. + // + // Dtype is set from options.mDtypeB. + CUtensorMap tmaB[1]; + + // TMA descriptor for C, (when useTmaStore is true) + // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from + // makeTmaShapeStrideAbc. 
+ // + // If batchM: + // Logical shape is [sum(divUpMul(M[bi], tileM) for bi in B), N]. + // Logical strides are [N, 1]. + // Tile box shape is [epilogueTileM, epilogueTileN]. + // Tile box strides are [epilogueTileN, 1]. + // + // If batchN: + // Logical shape is [sum(divUpMul(N[bi], tileN) for bi in B), M]. + // Logical strides are [M, 1]. + // Tile box shape is [epilogueTileN, epilogueTileM]. + // Tile box strides are [epilogueTileM, 1]. + // + // Dtype is set from options.mDtypeC. + CUtensorMap tmaC[1]; + + // TMA descriptor for the block scaling factors for A, for MxFp{4,8} and NvFp4 formats. + // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from + // makeTmaShapeStrideSfAb. + // The layout of scaling factors for A is always R128c4. + // + // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. + // M must be a multiple of 128. + // K must be a multiple of 4P. + // The "logical" shape is: [paddedM, K / P], where paddedM is + // sum(divUpMul(M[bi], tileM) for bi in B) if batchM, + // otherwise divUpMul(M, TileM) * B. + // The R128c4 layout is: [paddedM / 128, K / P / 4, 512]. + // The shape we use for TMA is: [paddedM / 128, K / P / 4, 2, 256]. + // + // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. + CUtensorMap tmaSfA[1]; + + // TMA descriptor for the block scaling factors for B, for MxFp{4,8} and NvFp4 formats. + // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from + // makeTmaShapeStrideSfAb. + // The layout of block scaling factors for B is controlled by options.mSfLayoutB. + // + // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. + // The "logical" shape is: [paddedN, K / 16] + // where paddedN is sum(divUpMul(N[bi], tileN) for bi in B) if batchN, + // otherwise divUpMul(N, TileN) * B. + // + // If the layout is R128c4, + // paddedN must be a multiple of 128. + // K must be a multiple of 4P. 
+ // The R128c4 layout is: [paddedN / 128, K / P / 4, 512] + // The shape we use for TMA is: [paddedN / 128, K / P / 4, 2, 256] + // + // If the layout is R8c4, + // paddedN must be a multiple of 8. + // K must be a multiple of 4P. + // The R8c4 layout is: [paddedN / 8, K / P / 4, 32] + // The shape we use for TMA is: [paddedN / 8, K / P / 4 / repeats, repeats * 32] + // where repeats = min(tileK / P / 4, 8) + // + // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. + CUtensorMap tmaSfB[1]; + + // The input matrix A. + // If (routeAct == true && batchM), the shape is [M, K]. tmaA is not used. + // Otherwise, check layout of tmaA to see the shape and strides. + void const* ptrA{nullptr}; + + // The stride for matrix A in bytes. + // Equals to K * dtypeGetNumBits(dtypeA) / 8. + uint64_t strideInBytesA; + + // The input matrix B. + // If (routeAct == true && batchN), the shape is [N, K]. tmaB is not used. + // Otherwise, check layout of tmaB to see the shape and strides. + void const* ptrB{nullptr}; + // The stride for matrix B in bytes. + // Equals to K * dtypeGetNumBits(dtypeB) / 8. + uint64_t strideInBytesB; + + // The output matrix C. Check "logical" layout of tmaC to see the shape and strides. + void* ptrC{nullptr}; + + // Inputs and output are MxFp{4,8}, Fp8, NvFp4. + // The scaling factors to apply to the output - can be used to incorporate input scaling factors + // as described below: C = SEncC * act(SDecA * SDecB * A * Bl) . (SDecA * SDecB * A * Br) + // -> ScaleGate = SDecA * SDecB + // ScaleC = SDecA * SDecB * SEncC + // + // Only the inputs are MxFp{4,8}, Fp8, NvFp4. + // C = act(SDecA * SDecB * A * Bl) . (SDecA * SDecB * A * Br) + // -> ScaleGate = SDecA * SDecB + // ScaleC = SDecA * SDecB + // + // Only the output is MxFp{4,8}, Fp8, NvFp4. + // C = SEncC * act(A * Bl) . (A * Br) + // -> ScaleGate = 1 + // ScaleC = SEncC + // + // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. 
+ // TensorRT-LLM API requires a scaling factor on the device. + // Shape is [B]. One scaling factor per tensor in batch. + float const* ptrScaleC{nullptr}; + + // The output gate scale for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. + // TensorRT-LLM API requires a scaling factor on the device. + // Shape is [B]. One scaling factor per tensor in batch. + float const* ptrScaleGate{nullptr}; + + // The clamp limit before the activation. + // Shape is [B]. + // Clamp is INF if nullptr. + // If applied on SwiGlu, it will be: + // + // x_glu = x_glu.clamp(min=None, max=limit) + // x_linear = x_linear.clamp(min=-limit, max=limit) + float const* ptrClampLimit{nullptr}; + + // The alpha and beta for SwiGlu. + // Shape is [B]. One alpha and one beta per tensor in batch. + // Alpha is 1.f if nullptr. + // Beta is 0.f if nullptr. + // The formula: + // + // out_glu = x_glu * torch.sigmoid(alpha * x_glu) * (x_linear + beta) + float const* ptrSwiGluAlpha{nullptr}; + float const* ptrSwiGluBeta{nullptr}; + + // The K dimension. It is the hidden dimension of the input matrices. + int32_t k; + + // The non-batched dimension. + // It is N if batchM, otherwise M. + int32_t nm; + + // Tile stride per batch for the non-batched dimension. + // It is N / TileN if batchM, otherwise M / TileM. + int32_t tileStridePerBatch; + + // TODO get rid of that. + // DeepSeek FP8 scaling factors for C + float* ptrDqSfsC{nullptr}; + + // The block scaling factors for A. + // The pointer must always be set regardless of the quantization recipe. + // If (routeAct == true && batchM), the shape is [M, K / 16]. tmaSfA is not used. + // For the layout (r128c4), see below. + // Otherwise, + // If MxFp{4,8} and NvFp4 formats are used, + // check the "logical" layout of tmaSfA to see the shape and strides. + // The dtype is Dtype::E4m3. 
+ // + // If DeepSeek FP8 quantization recipe is used, + // If batchM: + // The shape is [K / 128, paddedM], + // where paddedM is sum(divUpMul(M[bi], tileM) for bi in B). + // If batchN: + // The shape is [M / 128, K / 128], + // The rightmost dimension is contiguous in memory. + // The dtype is Dtype::Float32. + void const* ptrSfA{nullptr}; + + // The block scaling factors for B. + // The pointer must always be set regardless of the quantization recipe. + // If (routeAct == true && batchN), the shape is [N, K / 16]. tmaSfB is not used. + // For the layout (r128c4, r8c4), see below. + // Otherwise, + // If MxFp{4,8} and NvFp4 formats are used, + // check the layout of tmaSfB to see the shape and strides. + // The dtype is Dtype::E4m3. + // + // If DeepSeek FP8 quantization recipe is used, + // If batchM: + // The shape is [N / 128, K / 128], + // If batchN: + // The shape is [K / 128, paddedN], + // where paddedN is sum(divUpMul(N[bi], tileN) for bi in B). + // The rightmost dimension is contiguous in memory. + // The dtype is Dtype::Float32. + void const* ptrSfB{nullptr}; + + // The per-token scaling factors from scale A. + // + // This is used for either: + // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 + // * When the routing scales are applied to the input activations (only when output is not + // transposed). The dtype is Dtype::Bfloat16 + // + // if (batchM (A is activations)): + // Logical shape is [sum(divUpMul(M[bi], tileM) for bi in B)] + // + // if (batchN (A is weights)): + // Logical shape is [B, divUpMul(M, tileM)] + // + void const* ptrPerTokenSfA{nullptr}; + + // The per-token scaling factors from scale B. + // + // This is used for either: + // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 + // * When the routing scales are applied to the input activations (only when output is + // transposed). 
The dtype is Dtype::Bfloat16 + // + // if (batchM (B is weights)): + // Logical shape is [B, divUpMul(N, tileN)] + // + // if (batchN (B is activations)): + // Logical shape is [sum(divUpMul(N[bi], tileN) for bi in B)] + void const* ptrPerTokenSfB{nullptr}; + + // The bias applied after the GEMM and before the activation function. + // The bias is applied before applying the global scaling factor. I.e. + // C = act(A * B + bias') * scaleC + // scaleC = dequantA * dequantB * quantC + // Thus, the bias' = bias / (dequantA * dequantB), where the bias is the original bias. + // + // If batchM, BiasType must be N, and bias shape is [B, N]. + // The bias is broadcasted along the M dimension. + // + // If batchNm BiasType must be M, and bias shape is [B, M]. + // The bias is broadcasted along the N dimension. + // + // The dtype is float32. + void const* ptrBias{nullptr}; + + // The output block scaling factors for C. + // + // If MxFp{4,8} and NvFp4 formats are used, + // The "logical" shape is: + // if batchM: [paddedM, N / 16] + // if batchN: [paddedN, M / 16] + // where paddedM is sum(divUpMul(M[bi], tileM) for bi in B), + // where paddedN is sum(divUpMul(N[bi], tileN) for bi in B). + // + // If the layout is R128c4, + // paddedOuter must be a multiple of 128. + // inner must be a multiple of 64. + // The R128c4 layout is: [paddedOuter / 128, inner / 16 / 4, 512] + // The shape we use for TMA is: [paddedOuter / 128, inner / 16 / 4, 2, 256] + // where inner = N if batchM, otherwise M. + // where paddedOuter = paddedM if batchM, otherwise paddedN. + // + // If the layout is R8c4, + // paddedOuter must be a multiple of 8. + // inner must be a multiple of 64. 
+ // The R8c4 layout is: [paddedOuter / 8, inner / 16 / 4, 32] + // The shape we use for TMA is: [paddedOuter / 8, inner / 16 / 4 / repeats, repeats * 32] + // where repeats = min(tileInner / 16 / 4, 8), + // where tileInner = tileN if batchM, otherwise tileM, + // where paddedOuter = paddedM if batchM, otherwise paddedN. + // where inner = N if batchM, otherwise M. + // + // The dtype is Dtype::E4m3. + // + // If DeepSeek FP8 quantization recipe is used, + // If batchM: + // The shape is [N / 128, paddedM], + // where paddedM is sum(divUpMul(M[bi], tileM) for bi in B). + // If batchN: + // The shape is [M / 128, paddedN], + // where paddedN is sum(divUpMul(N[bi], tileN) for bi in B). + // The rightmost dimension is contiguous in memory. + // The dtype is Dtype::Float32. + void* ptrSfC{nullptr}; + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Routing activations parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// + // These params are used when the kernel is configured with -routeAct true. + // The inputs are not padded, but the outputs are padded to divUpMul(M[bi], tileM) for batchM or + // divUpMul(N[bi], tileN) for batchN. + // If -routeAct is false, the params are not used and should be set to zero. + + // The routeMap for the input tokens. + // Map of expanded token index (counting the previous padded tokens) to the batch index + // the token belongs to. + // The shape is + // [sum(divUpMul(M[bi], tileM) for bi in B)] for batchM + // [sum(divUpMul(N[bi], tileN) for bi in B)] for batchN + // The dtype is int32_t. + // + // There are 3 tokens [0, 1, 2] such that [0, 1] belong to batch [B0] and [2] to batch [B1]. + // Let's assume that the padded size is 4. 
+ // + // The expanded indices for tokens [0, 1, 2] are: + // expandedIdx[0] = 0 + // expandedIdx[1] = 1 + // expandedIdx[2] = divUpMul(2, 4) + 0 = 4 + // + // The route map is [B0, B0, X, X, B1, X, X, X] where X could be any value. + int32_t const* ptrRouteMap{nullptr}; + + // Total number of unpadded inputs + int32_t numTokens; + + // Total number of batches + int32_t numBatches; + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Batching information parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // In some cases, some CTAs must early-exit. E.g. when the grid size is set statically, but the + // actual workload is decided at runtime. This element on the device contains the number of CTAs + // that do not early-exit. The number corresponds to the X dim of the grid when the output is not + // transposed (i.e. batchM). To the Y dim, otherwise. + // The size is 1 and the dtype is int32_t. + // Used if isStaticBatch == false, otherwise set to nullptr. + // The pointer points to a scalar and the dtype is int32_t. The pointed value must be >= 0. + int32_t const* ptrNumNonExitingCtas{nullptr}; + + // Pointer to total number of padded tokens. + // Computed as + // int32_t totalNumPaddedTokens{0}; + // for (int bi = 0; bi < options.mNumBatches; bi++) { + // totalNumPaddedTokens += batchM ? divUpMul(options.mBatchedM[bi], options.mTileM) + // : divUpMul(options.mBatchedN[bi], options.mTileN); + // } + // The size is 1 and the dtype is int32_t. + // If isStaticBatch == true, ptrTotalNumPaddedTokens should be set to nullptr and + // totalNumPaddedTokens is used. + int32_t const* ptrTotalNumPaddedTokens{nullptr}; + + // Pointer to the map from the CTA index (in X/Y dim) to the batch index. + // Maps CTA index in batch dim (i.e. blockDim.x if batchM, otherwise blockDim.y) + // to batch index. + // E.g. 
with listM = 128,255,32 and tileM = 128, should be equal to + // ctaIdxXyToBatchIdx = [0, 1, 1, 2] + // If isStaticBatch == true, ptrCtaIdxXyToBatchIdx should be set to nullptr and ctaIdxXyToBatchIdx + // is used. + int32_t const* ptrCtaIdxXyToBatchIdx{nullptr}; + + // Pointer from the CTA index X/Y to the expanded tile index where the expanded tile index is + // computed as: + // + // int expandedIdx = 0; + // for (int bi = 0; bi < batchIdx-1; ++bi) { + // expandIdx = divUpMul(numTokens[bi], TileM/N); + // } + // expandIdx += + // E.g. with numTokens = [128,255,32] and tileM = 128, should be equal to + // ptrCtaIdxXyToMnLimit = [128, 256, 383, 416] + int32_t const* ptrCtaIdxXyToMnLimit{nullptr}; + + // Total number of padded tokens - used as the stride for the activation and C scaling factors. + // Check ptrTotalNumPaddedTokens to see how it is computed. + // If isStaticBatch == true, totalNumPaddedTokens is used, otherwise ptrTotalNumPaddedTokens. + int32_t totalNumPaddedTokens; + + // A map from CTA index X/Y to batch index. + // Check ptrCtaIdxXyToBatchIdx to see how it is computed. + // If isStaticBatch == true, ctaIdxXyToBatchIdx is used, otherwise ptrCtaIdxXyToBatchIdx. + int32_t ctaIdxXyToBatchIdx[MaxNumCtas]; + + // **Expanded** limits for the batched dimension: + // tile * ctaIdxXyToTileIdxMn[ctaIdxXy] -> ctaIdxXyToMnLimit[ctaIdxXy] + // Check ptrCtaIdxXyToMnLimit to see how it is computed. + // If isStaticBatch == true, ctaIdxXyToMnLimit is used, otherwise ptrCtaIdxXyToMnLimit. + int32_t ctaIdxXyToMnLimit[MaxNumCtas]; + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // All-reduce parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // The rank id of the current device in the multi-gpu space. + int rank; + // The number of peer devices in tensor-parallel group. 
+ int tpGrpSize; + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // GatedAct parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // Pointer for partial row max for DeepSeek FP8 recipe. + // This is temporary storage for the row max results. + // If batchM, the shape is [2, totalNumPaddedTokens, N / 128] and the dtype is float. + // Otherwise, the shape is [2, totalNumPaddedTokens, M / 128] and the dtype is float. + float* ptrPartialRowMax{nullptr}; + + // Flags in global memory that sync on "exit" for row max computation. + // The shape is [numTilesM * numTilesN / 2] and the dtype is uint32_t, where + // if batchM, + // numTilesM = divUp(totalNumPaddedTokens, tileM). + // numTilesN = divUp(N, tileN). + // Otherwise, + // numTilesM = divUp(M, tileM). + // numTilesN = divUp(totalNumPaddedTokens, tileN). + // + // The memory must be set to 0 before the kernel launch. 
+ uint32_t* ptrRowMaxCompletionBars{nullptr}; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace batchedGemm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h index 616383f6a1..f6c8b18092 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h @@ -20,6 +20,7 @@ #include "trtllm/gen/CommonUtils.h" #include "trtllm/gen/DtypeDecl.h" #include +#include namespace batchedGemm { @@ -77,6 +78,38 @@ public: } // Returns the offset of the ith chunk + int32_t getChunkOffsetByName(std::string const& name) const + { + for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii) + { + if (mSmemChunkNames[ii] == name) + { + return getChunkOffset(ii); + } + } + throw std::runtime_error("Name not found: " + name); + } + + // Returns the first chunk reuse flag given chunk name. 
+ int getFirstChunkReuseFlagByName(std::string const& name) const + { + for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii) + { + if (mSmemChunkNames[ii] == name) + { + return getFirstChunkReuseFlag(ii); + } + } + throw std::runtime_error("Name not found: " + name); + } + + // Function to calculate the total size of the SMEM array + int32_t getTotalSize() const + { + return getOffsetBeforeChunk(static_cast(mNumBytesAndAlignmentPerSmemChunk.size())); + } + +private: int32_t getChunkOffset(int32_t ii) const { if (mFirstChunkReuse[ii]) @@ -91,12 +124,6 @@ public: return getSizePaddedToAlignment(offset, mNumBytesAndAlignmentPerSmemChunk[ii].second); } - // Function to calculate the total size of the SMEM array - int32_t getTotalSize() const - { - return getOffsetBeforeChunk(static_cast(mNumBytesAndAlignmentPerSmemChunk.size())); - } - // Returns the first chunk reuse flag for the ith chunk. int getFirstChunkReuseFlag(int32_t ii) const { @@ -139,9 +166,7 @@ int getNumSmemBitsPerElt(tg::Dtype dtype, tg::MmaKind mmaKind) { if (mmaKind == tg::MmaKind::Auto) { - std::cout << "mmaKind != tg::MmaKind::Auto" << std::endl; - assert(false); - return -1; + throw std::runtime_error("mmaKind != tg::MmaKind::Auto"); } if (mmaKind == tg::MmaKind::MxFp8Fp6Fp4) { @@ -541,14 +566,14 @@ inline int32_t getTmemBufferSize(KernelTraits traits) inline int32_t getSmemOffsetLoadA(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(0); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadA"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetLoadB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(1); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadB"); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -562,64 +587,63 @@ inline int32_t getSmemOffsetLoadAb(KernelTraits traits) 
inline int32_t getSmemOffsetLoadShuffleB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(2); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBShuffle"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetGmemC(KernelTraits traits, int resIdx = 0) { - return traits.mSmemAllocatorHelper.getChunkOffset(3 + resIdx); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemGmemC" + std::to_string(resIdx)); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetRowMax(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(5); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemRowMax"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetSliceK(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(6); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemSliceK"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetPerTokenSf(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(7); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemPerTokenSf"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetBias(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(8); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBias"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetBlockAmax(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(9); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBlockAmax"); } 
//////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetConstSfBuf(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(10); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemConstSfBuf"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0) { - // Be conscious that the index (3 + resIdx) should match the index in getSmemOffsetGmemC(). - return traits.mSmemAllocatorHelper.getFirstChunkReuseFlag(3 + resIdx); + return traits.mSmemAllocatorHelper.getFirstChunkReuseFlagByName("smemGmemC" + std::to_string(resIdx)); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -630,28 +654,28 @@ inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0) inline int32_t getTmemOffsetD(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(0); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemD"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetA(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(1); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemA"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetSfA(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(2); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfA"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetSfB(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(3); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfB"); } 
//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h index a5cb3ab953..f15f246f81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h @@ -181,6 +181,8 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st if (result != CUDA_SUCCESS) { + char const* errorString; + cuGetErrorString(result, &errorString); std::stringstream ss; ss << "Error: Failed to initialize the TMA descriptor " << result << std::endl; @@ -283,8 +285,10 @@ inline CUtensorMap buildSfTmaDescriptor(tg::Dtype dtype, std::vector c if (result != CUDA_SUCCESS) { + char const* errorString; + cuGetErrorString(result, &errorString); std::stringstream ss; - ss << "Error: Failed to initialize the TMA descriptor for SF " << result << std::endl; + ss << "Error: Failed to initialize the TMA descriptor for SF " << errorString << std::endl; ss << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json index 212cfb88d6..d54e8a3861 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json @@ -12,7 +12,6 @@ "epilogueTileM": 128, "epilogueTileN": 8, "numStages": 4, - "numMmaStages": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -30,7 +29,6 @@ "sfLayoutB": "8x4", "sfLayoutC": "8x4", "batch": "N", - 
"useMetaFp8": false, "numExperts": 128, "useCudaGraph": true }, @@ -46,7 +44,6 @@ "epilogueTileM": 128, "epilogueTileN": 8, "numStages": 3, - "numMmaStages": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -62,7 +59,6 @@ "gridWaitForPrimaryA": false, "gridWaitForPrimaryB": true, "batch": "N", - "useMetaFp8": false, "numExperts": 128, "useCudaGraph": true }, @@ -97,7 +93,6 @@ "hoistMmaTaskTryWaits": true, "numStagesMma": 4, "batch": "N", - "useMetaFp8": false, "numExperts": 128, "useCudaGraph": true } @@ -107,7 +102,6 @@ "_template": "BatchedGemmFp4LowLatency", "routeAct": false, "fusedAct": false, - "useRoutingScalesOnInput": false, "useUnrollLoop2xForMma": [true, false], "dtypeC": ["bf16", "fp16", "e2m1"], "listN": "8,8", @@ -119,7 +113,6 @@ "_template": "BatchedGemmPerTensorScalingFp8LowLatency", "routeAct": false, "fusedAct": false, - "useRoutingScalesOnInput": false, "useUnrollLoop2xForMma": [true, false], "dtypeC": ["bf16", "fp16", "e4m3"], "listN": "8,8", @@ -131,7 +124,6 @@ "_template": "BatchedGemmDeepSeekFp8LowLatency", "routeAct": false, "fusedAct": false, - "useRoutingScalesOnInput": false, "useUnrollLoop2xForMma": [true, false], "dtypeC": ["bf16", "fp16", "e4m3"], "listN": "8,8", @@ -145,7 +137,6 @@ "routeAct": true, "fusedAct": true, "sfLayoutB": "linear", - "useRoutingScalesOnInput": false, "useUnrollLoop2xForMma": [true, false], "dtypeC": "e2m1", "numTokens": 2, @@ -166,7 +157,6 @@ "_template": "BatchedGemmFp4LowLatency", "routeAct": false, "fusedAct": false, - "useRoutingScalesOnInput": false, "useUnrollLoop2xForMma": [true, false], "dtypeC": "bf16", "numTokens": 2, @@ -191,7 +181,6 @@ "_template": "BatchedGemmDeepSeekFp8LowLatency", "routeAct": true, "fusedAct": false, - "useRoutingScalesOnInput": false, "useUnrollLoop2xForMma": [true, false], "dtypeC": "e4m3", "numTokens": 2, @@ -212,7 +201,6 @@ "_template": "BatchedGemmDeepSeekFp8LowLatency", "routeAct": false, "fusedAct": false, - "useRoutingScalesOnInput": false, 
"useUnrollLoop2xForMma": [true, false], "dtypeC": "bf16", "numTokens": 2, @@ -233,7 +221,6 @@ "_template": "BatchedGemmPerTensorScalingFp8LowLatency", "routeAct": true, "fusedAct": true, - "useRoutingScalesOnInput": true, "useUnrollLoop2xForMma": [true, false], "dtypeC": "e4m3", "numTokens": 2, @@ -247,7 +234,6 @@ "_template": "BatchedGemmPerTensorScalingFp8LowLatency", "routeAct": false, "fusedAct": false, - "useRoutingScalesOnInput": false, "useUnrollLoop2xForMma": [true, false], "dtypeC": "bf16", "numTokens": 2, diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index f80b9172e6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:025a21a07a797de485c2c86bfc5dbbe1fb5c494466346a207e7fc8ac266f60b8 -size 583355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index de97381a21..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d0f7bcd89357e66b7c75ec67ccb481de87d8c73898ee8f81672b0eb35830769 -size 691616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index ed326410aa..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c73f79940b8691ecc3a14ce254bd64afced1824a79ac96f2e305dedfebe8f7b5 -size 606299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 5064016786..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c7d783dc7ece8cea35d9bf83a7c8eea3986d9f29c3cd67389433d2357ade2d74 -size 729162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 20888a1d49..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a4a8773b5462296ac6b1fb65c34e5fec54be4dd41e3b52c50e463c436f616d12 -size 552325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp deleted file mode 100644 index a609d1ffe2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:afb26c01c09a14d5fd5243d018164e3cb47f6acb26b380810bd24ea20b4deef3 -size 549397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 6c75e50a77..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:19f7b7a44abef9ab58ead44ea82570c2669c41b2bf9dc97300a1cdc732aff6dd -size 688754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp deleted file mode 100644 index 024e1e7caf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fd8cf63739a7968321556b7980ce56577f9692921a7f62043295be7e0f9f19eb -size 575499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index dcb2ed07e4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3cc777b009a1fe7fcf4f5c2b794c0eaca2943e8f190c94fc4d3bb30c49348f85 -size 722256 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 43a9dab09d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f072615f04e0d0678e70294c0d6b7486e225d2382bad8f8427390a125bb48236 -size 610163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 27af07cab1..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:83bb1c43a309a3c0ce1f56cc239e517a7596a4cbdd59c16adec1ca9fb708043c -size 779992 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 4cf4506310..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8994ba4c2ca7dd0bf592da450e1404441f358657f7f60c4cdd56b56f9c448131 -size 628668 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index da87049c7f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:5cdc0f6feff2b0ea40dfb73159dc2a5038a2f46b328d057da200bd76d1fec044 -size 806784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 68b132b2b6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:81396786711ebd423e67d113dc1f1d2d963385b1f56b56d51448070b538678d2 -size 501179 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 6d2239f8a0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:0d82db55cdd10e897a4e3f7778c09c10bf029a69a86a80da92baebe7a6dfcaac -size 632034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index f632f63e44..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c23c02a534806734e5522c54cf9f6ddc112df09c35a0834439796f357ea2c700 -size 518895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 4485f5c7cc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp +++ 
/dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6dca1a3d18d04e8d20b41bea46729faaf36d22e1aec5e75b2dc9d8c074dfca5b -size 661244 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 104fa0c8c1..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f66cc836ac67fbf37e27c790d93fd56ace7f84e45e1e8e709bc92449f171c40b -size 518249 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 54a234683b..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:44a920497d1cac7d3c2e27898758f4a069b2e40b16851e1c671a57580a6de613 -size 653742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 27487413df..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5de768877c4368ea3798f6410d5d66cd5010e7debc14fee453941f823cca1d75 -size 535963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 
index 41805ff56f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aed58a7fca6462cd5f1f15181c4818071dd99fe7b436b570132c6cc3df6e97ed -size 682112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 36cfaeacd4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5f7294cdde3778c06141ce0ae68e293ec6b9ca990373e452c01f91852cbbb556 -size 554015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp 
deleted file mode 100644 index 81de55d568..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cbbf9129f822e95d3fae46330460eb5b50878e12c5f3d495102edb04fa7ca69 -size 699918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 8da3ce0b74..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:28086a74790bb3e11722786569b95be426bf3ada28a82a36730dd3465d904fa7 -size 571731 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index a63d7844f7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92fa64e4f820b2f5d1b4db1864220d36231c2a6451c8a93b5aa13a3cd299caf4 -size 729028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 94c748b3ce..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a8f2365a13656aaf10d935265da62b26ff30019fd151296a2c1dea1ee633d1fe -size 539851 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp deleted file mode 100644 index 5db3fd96d6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9d42abe6ddcac40aaa119421f50038adb4edf9f4e12931ba8528a858bd4de0ca -size 514477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 7859a3975e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8b3e59916d41f8e66fa2d474819a031d2a24caaba31485dce0a62d5e12e2d24a -size 629660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 8fc8d104fb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:917ef5d93cf7cfbd76f3d32b3c051f4e119631a0324801a4b8684bf98157857c -size 559193 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp deleted file mode 100644 index d5d56c56a8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:330cad5e70b49cf039b5931e9fa6686298bd04d9416e7684aa5f31c4e5e46444 -size 533821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 1ac491c269..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:82707942c797cba2d1352e94fa594aca0a8b1c3c07d99646c41227a83f740766 -size 651864 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 7f8299ac92..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f7f0619d6293f8448940ba954221fe2c841541bd8199fc943983eb6acd41ed61 -size 420605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp deleted file mode 100644 index 3bbf5ee43b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7882097269f622d56d99f704b42a0331d1fdc59442e803ff9c8f836b67c7d7d2 -size 417333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 8640661908..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34e40aba2b816a6bee4e8f30bc758d7ed3438b290341d6834c7985232f3c4760 -size 504443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 87b13fa745..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fbf07013056fcb9c58a4a90fa2783eb38a3ee311e2ec4d2a5fb7aa855399a389 -size 443893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp deleted file mode 100644 index 0bb069d3f8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:15e4735cc2d2e2bf3fdb10698e20303a532939acdfcb6c8c691b2a2d569663a6 -size 440671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index b2669c3acf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a9d9b821d0dcb825bed9911cdd556ed063ba48b398294eb4848159a2d11ae34 -size 526055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 21889830f0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8faa5ad1e00d5e9bfd17f288934536ad440d4655c5efac5e6a1fe01e7be245fa -size 563777 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index aecb4e1dd9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb0133260a01b83ed44b6a0b150dbcad2857f575a70476c41da2cca6a9b35f55 -size 736122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 6b74f349ae..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1acd2b5b82fd69f3cf568ff96ace34ac5ed120785e5c5a5b8f45dccb7a589422 -size 591605 
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 25066e0555..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x256u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1df67c891567c1b074da7b6dae1b3b819bde680cd35b2cf5ab89a750b8cc943a -size 761582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 9a82468a76..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3e3bc3b8b35eba8111d94ab7d195b8b731cc9b5f441bda0bf7784e0fdec1cc09 
-size 610349 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 539d9762d0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7770d7a52f7ec065e1f3df861c273edf6f7e1cc677c36ac11c0352b50be83649 -size 739970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 2e3d6fc33d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:86d84a8dbf978b75d87959e2cbcc50f1863d8f2c771bf26c5c4ad47996f81e18 -size 640150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 226404950d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_persistent_batchN_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:385fa25c09e6dc066957de238fea4f0636ee52cc90f7ad456c61f68390b13aad -size 766516 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp deleted file mode 100644 index eda32eb064..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:ef6738ce1712ecebe6e717e09bd7d78b324f828a4b3f4df2898511275383aad4 -size 600993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp deleted file mode 100644 index 108550c5ee..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f4e0e4a92d078fc240e9006ae271cc08a91f214d31cc40269affe86f83b664ec -size 555211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp deleted file mode 100644 index a245c9b0e8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:7e5b78e9b3db3a45609fba97e9df3c960f5f380e6a59bfddb2423f48908bf51f -size 626404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp deleted file mode 100644 index 7820f4b9a7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E2m1_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6d33c4ccff36bae9207ff93168d3607d5b0681bc586d50b8c6995b409661403 -size 581313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 7e07a1dcc7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:d730384eeb2bd8f97d3f0daeabce52e0ea4ed80cac954d9755f51108b0ffce97 -size 841702 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 3bae4036eb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:20642ded83c712aeeb9ba418353f6fa7552c4622777c61d128253e0e598b85e1 -size 859416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 0114b7a888..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 
+0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c8805e951d49c58123db07253b867803d3ae30942269c80915e7ea81eeffba66 -size 524185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 093b53806e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:15c42189f82f2ea343576b589a1c28748d0ab3af0d04a11d5925f1cd97c4e23e -size 542689 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 689d1fcf71..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8f5a33ba3b1201b9884329362f5192ab4d394d26b596b5d96f9ab64535f9426b -size 565625 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 81365ccdbb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ecbdc8d75681767b45d8f967a7f13dc9c852fa131c274b6c22dac829a0b4a919 -size 582551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp deleted file 
mode 100644 index 2097d2c376..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b496add3396b71793dbe998f4b1579c1bcae810286df86b0351f12c4197a4697 -size 633262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp deleted file mode 100644 index bf7f917c8d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:246d7c59b39586b37dd675292254fe90654e5d158ef24aef8ce04d4543c55aed -size 650976 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 6a1f9358b2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:868f0fedc4e99546431d08068f43cdd9c3c2f523ed922844b66f4ce83717c402 -size 558219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp deleted file mode 100644 index 5f523f2729..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d0463544c6a80fd23dbdc767dfc96f512b2757dec2d976c917a9850a16f9e6f5 -size 527297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 4994666099..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_routeLdgsts_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3fd00a01e561343de9f8bd212b5e6da151e55e68cb12e37aa2ee0edb7cf3d76d -size 575243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp deleted file mode 100644 index 2828d8fb73..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2cb24d3bc83aabe4c4ae6bac627a70fbc5cfc537f917a6c93b8d6830e1e0c373 -size 545849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 07dd143888..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9c805a4f096f9f792b6402c607565f2e88fc1a56344e654618c3bbe1b615e01c -size 392855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp deleted file mode 100644 index fc76a27474..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:08067655e5ad721255bb86ec17617c7217ef9dc1165d1ce4bcc11c5a86dec681 -size 415845 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp deleted file mode 100644 index 908071b19a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c4cc962ca87e8e42ccf3aab87d2c921cf747ff1cc8d2cb8e9b339491a47122c0 -size 418563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp deleted file mode 100644 index c453b40432..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:342721dcf5114816a32a15d31e24711a2aadd52ea4eb578695c502324786f163 -size 439973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp deleted file mode 100644 index 0b5e17dbcf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f11f3bc7b0c137577c335e4762068a0de4785a544d85f2575b787360c300f0a8 -size 548601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp deleted file mode 100644 index af47072c14..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a35b8ab05817e3168b36002a8c0bfa45ff6cbe09d3a2db4af8be4dfb95fb6e6 -size 574701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp deleted file mode 100644 index 4ba5eb66d1..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c4f2e7ba6b10a99f386253c6b9315620804590f2feae7dd78351c3fce34d9ec -size 513681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp deleted file mode 100644 index a5ae523b14..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:86179f2a7d2ddbffcb8b40a49f5babc31dc6dd80acff882a67a1eae40d396b24 -size 532233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp deleted file mode 100644 index b9c02816a7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:59566acf5116dd2221842073c5fcea6bcf70eb5ee29b14482e5d4efd33ebadb4 -size 415745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp deleted file mode 100644 index 1cdf0ce039..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:41933fab6978bc725dad91ccd0539a25f804f1e579a0360d3d6289eab0e076de -size 439873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp deleted file mode 100644 index f6e7eff70c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf376b0111b28c57f20cdd44a5edaa98edd364da22b194dcfa4188e8247068c7 -size 625426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp deleted file mode 100644 index d0ad1d6f84..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_routeLdgsts_silu_dynamic_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fef76304ee09e3ca32b459db98ef67f3a5cd93e8526c208115227b97eacaf22e -size 659520 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 6458b602da..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ea03bd9dbfb524b5f20de9aa11629b4c56aa23d03162bce221db2823bb227a44 -size 695858 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index d8a37c599b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:42aeda3c7a2ba5b2506cfd9d38064a2ed4ea72f93758b1f3f11c375461b36c88 -size 575929 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ea60080697 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:815661113030c8580cc963c443afc3c82fff7c8f8dd8a0ed98f95a08a91f619a +size 684616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..7398382e17 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75593264df3d37f7d23664c02e426196a5f7ee1cc6de76db84895fca4e706c97 +size 562811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index a672ca4e29..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cc2a320b8589746d4193fab08c40f2221bc31c4d6f0e65db57386d6a38c75a19 -size 718062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 0efbcc95ec..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ceb630ea13703b70566398f5421b6fa5c398b72cf415cfcb34479ad59858adf4 -size 598083 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..6ab10b4e46 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:11afdd6753faeffe1446abed54c61ccf0e190ff886fc0270d055a4dbce9a9298 +size 705390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..1be82f2378 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e4ecf7abe90d1a7f46a1ff3005cad8ebe0de8e7cf89862cf993f228a98ea00d +size 582747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index a2a8e3ab4a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1245c751dbef978b1c4befcf5bd43a8f2c9579d82bda49310b8ff3a5f9b51555 -size 677458 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 372d7d526a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ce89e844f370baa26823f610f9f4bf92febd85edcf73cfa48aa79a24b2acad55 -size 544207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index dc1d9437d6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7cccf1e15a06823668068d7d8c762b9e9152f9b16b00e9f9096ea525c8fd2fc2 -size 541229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ad367f3b91 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e921f66e6adc5292b99df5cfbae4a9cbae6182c8e99bbc83ea30bd1ca8ed8f55 +size 667892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..fd64e17181 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b6994776b67405b359d471fa5c873efa4714dd71397ddfd82a52d59cbf20a9a +size 550035 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 1b75d56c4a..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:04e231d8aefe68dce4ddf7ad71a02d4ab36ee1b22896526a2e3415be9bd2704c -size 710466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 8c6d52d14c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec9c13b6822a66c5c2f4be788e25b82e0373728ce65beacc599967a38a3d1dc1 -size 570309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 1af8b6e801..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:ebad2f94ecd782e421bb1086268d9d6906e86f119995bbcc8a4da01233c6e65a -size 567379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..0a76b6bb5c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec5cf07ec8b7a4405c305fb82f9eb7179a4a43ab14a2eacfadc35072b317cfd7 +size 700704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp similarity index 81% rename from cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp rename to cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp index 0811ac534f..5d763ed185 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/BatchedGemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x64_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_batchN_dynamic_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:532b00782f06b21405eae6c207cb539494b552d7db05bcecd96bf872d4a586fd -size 578475 +oid sha256:6188f80d9ca7f95ea404a72de95bb8a38cace5dd8a8e76527fd83cf16aaff87d +size 575543 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 9504710d98..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1874f3deb9c2f9bcb839bf528265b6ffb471216aba3be6d51e836d93bce639ca -size 737052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp 
deleted file mode 100644 index 8be3ab71e7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:509cffe3503bf6c6f6709a206bb39594497922b52ee412378855340d3e721e74 -size 579381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..49e14cf3f4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b725a2ec74293ef928ade1206ebf2e3726f5980bc943f157372a414834d756fd +size 725020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..9f36031dff --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b2bd2855cc99dd074f24435265bfc32d0114a2d9e02ff98565c7881095674dc +size 566363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 13bb09a8f6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7ae0c7dc73afe57710925d89bb7b0f0ce8370350fb9738c29fd3b040a5be3ce8 -size 758418 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 50c2655d58..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:e5f9d22769d4aacc5ff3c7e917f70534044d9cd4c7f9d92465c595e77af6fcce -size 602325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..4e3f4f0a99 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2be435473939c1f81601b61b372a37982b0aa0f107cd4778c201d160c4f8e43c +size 745004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..c2e9f855cf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba80fab864884c31fdca7963a86df70f384e06e5ef76e904971543af60a05c06 +size 586347 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index ae495e71e7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:477372ea4f8df62dae1a726cbb6568c2254e48c99867c937b97931b6fd7ed4ff -size 680418 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index ee987184bf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:99e4e49a256529e3c0e9825508900810622d2e23e9f0e918674bd68bce585fe7 -size 547069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 22894dd697..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a17104632fb71fee1a3ea24fcd2d36785d6de547d706c9898782d60648a97e9a -size 544139 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..e9f6b4a0e4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdf20f912daa7f6e4580ead6b14f66c3aa0d70d536dfdb509ab06574f8dedcc2 +size 670656 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..10a9a79555 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04085e7681246fbf85f61871704ceb68ec39dc3a2ed9a4c3b9855b8da6d6a0f6 +size 552255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 8b13d7de7c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3b49ec3e3eb755bea7acc9a65357a373820ef380f4eeb33c7312014ba24a9a40 -size 713228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 2f1f87f544..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:8299ea1b71faf5bf29b281bf34809b42abd2c7897e633100c4c84a0927ae3ba6 -size 573219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 87ee3f07d9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12e260f137725e57e932e277c5c302225ca2558de84149d18cf951e6eb2297f2 -size 570241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..aeaeba607a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46d47463f5c2c2711468fde51b407b9e31fc7458cd98c2f2d79181037554591e +size 702972 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..c8d4342940 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58fb73ed28d0c7c1e0705fcade19faa88e7b310c4b97d0004c84ed52cc275cd1 +size 577763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index aebfdf44ac..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6431fe33358e47eece4cde23298c4b0c38b899dc388a42771f03277a7398dc5f -size 711892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 31f7064eae..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b7fc72ddce8dba56b20fbcf6b357a5606fbdea79947dddac55a5f3870021c476 -size 591913 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 8fd44e9fea..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a12a6829be0ad78634f64e3953553ea31726c6d7610c048b7a1ebd5699d4e486 -size 588983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..0df83c1ed0 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fed2ac3d4effc40881584f93f4ecf938d0081131cb7b5fa26519be7101d8b0a4 +size 700600 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..1159769f01 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:516bdac65a439b1ab5615912d573172c38792ab898e68f1b972253588767c398 +size 577315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index c2f21b3815..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:76b9012ca3a7e2708c87583f06511de344cef82c39de3880c84a06ac4232b24d -size 732270 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 1c0aa2051d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:07be5f7ba072b1dc11d1f997e55ecd2f867d3e34a0ff482293e43da0c2e7b4de -size 613277 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 99b5116275..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5b796df0502bd1c66514e091efb460a12cff37fb7687da3b227d2aa97b168f02 -size 610299 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..cd52416b52 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31b8d4b7836e084a9b3f5e8a76d8257f33840162d26b385db5bfcccfd36333fa +size 719648 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..6b3ff0aedb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10607e6a0e33ca77881111bf04f7e190bab576f1987ecaf38298b96682a3f51d +size 598089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index f00e7ccd98..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eed37e6ff29c4cd8ef47f701177a91c364543c6673973d2aac26d3f04a879792 -size 689988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index fd6a736207..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:012ef5e269cb6b74a1327e77a6167bab0d87b1609acd395904c8b823711c0069 -size 562263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ad9f8b93a8 --- /dev/null 
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28e437809e9158ec82047a8ff72248fe641c735eb6ebb50984b76fda16df32b2 +size 680966 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..25893f8f2c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea3a2e628b52e2ab71dee5405da588b8735f5cd2d250dd2b106621c603bc4183 +size 549047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 3229637535..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:631514024f3603cad2f6367284e8ea298dc9d144ce83958488c9968fd069548b -size 722848 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 10f61d8be0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ccccdbe8681f605c89e142ed692cfb12daa43843e05879e27a790fab9a59571 -size 588365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..5b31abadc4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cebb410d52fda0fef353941e34f1e9b09e152b6d801885d43f39b05ab7feecb +size 713284 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..be9e44d083 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a01cadc451bde20a1dc1b2905f216468009401f5c230d8e3e172eb7c0e19a73e +size 574557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index c79ca7b12f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f5725d34d8bbf7276419c03a5dad57159182b17aa4f8c222bb5e9a4cc5a32f40 -size 691906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index a6cfe8b829..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e3dd827a278f44a62caf0dc0e5eb237af98f3eb9c6fda0a70952677861163b68 -size 571433 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..654bdb4b14 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8025a48697e5c22bb2cd0d21023d7a554ef64cb8658f24f171c6c249b0104941 +size 680714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..93d0c09eaa --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2c747c157bc41de3c7e85f31cb39752451d5809476af277602d64c0b5a6cb27 +size 558317 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 1677a9cfc1..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:89c9352232a787bef25c4e83463811b48c6c31ec0cd13f329b58bd9449bd3265 -size 713272 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 57f873df85..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:3dc5ef98dee4cf3c68ae199d2f3fa73ba92c3ddb79f2989fc79f495d55944563 -size 594377 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..5218751f5f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22751baf77b6f7d97d3914458973a6ca03dfe01e7f9fd84e419903b133d82b16 +size 700746 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..7a4b66d1bd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aa5c83701db0e0adf0f1727f454867a29c941587f1d6be2fa496143b5e768a1 +size 579089 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index bdff85b2cb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a0da9cb026a49bc75349ab47882e54bbbf942093de625ad354ae987370b913aa -size 672864 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 6f1167f40d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8b17571be698ee3e4a07316107d78eac7e718469727d777107f7e73ef8f61adb -size 540403 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index c0fbc68938..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c57d23ab2e12978413d202174749d81bcfb33bb4c98799219829ba44b7680200 -size 537473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..addbcd1565 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70f3e4c712f6c1e7a887771473daea1f4ebdb3febb4ae8e0a8e8045ee867712b +size 663694 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..735fb2326c --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6d4e15ebd2d638df47afc4fcc58ff56ba8032d7352475cdae04c20195795557 +size 546377 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..4c548b23a5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a4899f491141f32b13e27af6af56d729abaf5e0454cbd6f1a6a2f409ccb237d +size 542611 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 65218052dc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:5cbd718178b556738f521e812559f6df74993686323e8a21ea3ba5e9b6a0a79e -size 707154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 8438afd183..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f3b9c4d266b0f9797dc77187295ae08840436bf1c2287f279877174b64195b7b -size 566553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 86ee12c706..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a0e92ba9c81d93736ad08f9f329215531891a129668ad7b1b177b64bdd9e614f -size 563575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..e924aa915b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8185e69e406c9e2cb2533f6c4f17ceba7c938642439634f265acad34cc56af5 +size 696010 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..6e1be3ef3b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f65dfb888be44aa2213e8a8a9dc4ff984e4224135df2e94241bb52ce60c19df +size 571097 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..35bb5532e6 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55285b3eaf9712c22ebc203ec690148dfb971f477710c0fc8139070573f138d8 +size 567331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 5f7fbf0d4a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4bafdf0381cbb64267fa0c71cf4e22db1d5b9feb5b2a94d04d2120f2bb65b4e4 -size 764890 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 65a335ba50..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:5ee1808468b46097cf80f3fad54b9e6106ee598ba6a7c4857f4f41e62499255c -size 598241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 9d72eb3998..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aac10046ceb8a8d1d6cff72d6c0078898edda8bcf2231b890d3d071d14430a16 -size 791682 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index f9b61d98d6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eec8c692f414de5e45861e78e6c1ce40626060c21c687205b2baac2c7dde9586 -size 616745 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 9b6315ea46..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:206a6c774d5bcf05084a5cb442bd556f2e11a89f1ee6a9750759833d91e53f0c -size 625270 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 4f15c87944..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:56b34ef4c844117a84610af598f5abfd5ea16cb4af599f28d3116551b63b5556 -size 501985 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..c7e99a3a1d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85420fddbd50cf79cc03a9f7b42957e853967e7995c768b874c38c23c317cd93 +size 621100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..cb07bf204e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55ef9504c71a596c687518e99a9de172baa0d407acaa438f810cc66b4ab03353 +size 497567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 22dcb89e0d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a0056c9290489916c07f2eefc88cf25ce95693862448741215196ef5a98ea339 -size 654430 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index cb8a824aaf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1321bfc2eb452dfba8b0f31c1f6f4691d53a9a8d7ece5d126297b089c1a94961 -size 521329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new 
file mode 100644 index 0000000000..673479f0ee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7d29557f9f4d2c5b259f81c0f26f9df5edc25f4029cbb0b4dbecf3e90e71b46 +size 648286 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..74c02a10bc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2486fee1a77912dbb3ebba6d4c4f6418c4c1a90013f0f2a323ece5e844f2753 +size 516467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index cc6db79288..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:46ecf18c833ffa14582595ed65017adfc2d2179e61fd3e39c944df13b535f02e -size 366195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index e2b619f5c6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9912ad5d3e034099aa93e2cde478a4ab72e8ef28ec5d40bc74c8d49b173306d8 -size 382133 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x512_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x512_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index a0dbe8dd26..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x512_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:437436b0c3c27917a40d1118a773acd685ae81480d88b25d91ee778bedbcb901 -size 407831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x512u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x512u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 14902b55f8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x512u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:deb62a7e353c1f89e1cb9c53ad8c1cc94524d0b872fce86ede6b1a4fb7df9a36 -size 432699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 0df88dceb0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:446d8f69fcf91362c872e997c3f307b3e3d29dca8726d70638cb43c9c4092bfd -size 638640 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index a35e9704c9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3de0e2402039c09b77fb9ab080274c61382e5e55004d20d9a1dc0dda80b5e9a1 -size 500901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..60a4705f69 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12e36aef6743d6ddd96ddb08418626fe5b3e4b83f663957386dec718d1325f8f +size 629388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..6099b2ccd9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36846461c9cee973c8e43123da81b8ce68512821cdb013bd3ff7e2b47cc4a736 +size 507681 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 50e2f181b9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:992b266f3acc77d0aa0bc644c5c68887a7e40d341628e1aedf13a6606f113a3a -size 667010 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file 
mode 100644 index 3852bad46d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a1a1f9ea4ddb9be65a055910649c51ecc25e477039849c243b90edf94be0f6b4 -size 520243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..dd379b98e3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba884722cb487cfa74838af9106919249ae4d0069cd7a76593f91644558f18d8 +size 654896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..62a08c6f2b --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fc47e97fff712a63a4837aaa4dfe2edbd4f8b3b6d0621d6478d9862b51156cb +size 523915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 394affc312..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e33b7a8589c3cb37b7cdfdce4ad228c14ed93a215728f663ac4bd06cebcf3bd6 -size 380255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 6cdd36e1e7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:2ef937ca40ce223f76e80b6fbf0e437b5dcb9aaafaf3855a3a18e69a2365005c -size 395453 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x512_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x512_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 8343998d17..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x512_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:25ef530b28d7c02b6ef8c2478fdf93f3dae65898683a1ef5ef3437685ce9cbe6 -size 407387 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x512u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x512u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 1218f9d99e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x512u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6c6060bad1fae17a6871f5f61b3628d498c709950516b64b5132377f6702fe5a -size 431515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index ddc03ddcb2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f6fc7bb87912da176c04623efed8f0341a04980e2204666067df7d7876e2d22a -size 684816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index a65460a3f1..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:89a314cd5c4737d647ae2d8399c37a11e096f3e64e99e160f8049c5dde8b5361 -size 536865 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 
index 0000000000..6ff1e45633 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15b965067f5d7bc01c722dd20399eb842a21e313040a47d5745bf6b305635ee6 +size 667276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..380089acb7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d6eb31c54ac57087ac4d1223fe88c62084b376ba9dfcbfc35032be1e179a2cd +size 543595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index ce47433e3a..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2d707d8c8e4af55733d9ddce971a0b1d960b1cbb8074b6c4923c3e05e1045fea -size 713926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index b617486701..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f9b647e32a5fb73a1e9d6c4c9d047be6f1b30b716ec91b2e9f9a46b7cb056186 -size 556207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..18e1bbfc02 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce665b7b39d4a3207eda877552a07913cb8e4cb7e53a9ed8833dcb550b45171f +size 699790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..861a696dc7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a024d6f97d8ca153bf21e7f0a8d510003a944fe2ef7c6c75c5645997acfe4e97 +size 560767 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index af31095e58..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:7daa186be00334dfa38647a1a8ce7d01b5f4f79ce6d8236030d7ecdb14f82714 -size 380995 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index dbd3299b21..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:639114a86500ab030885dfa07b21358d0399b4caf669363cc06c038bc3a8187a -size 397771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 2db13ab016..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0235bcd6171392bbb6f3f3c5c45781a8f1d0f409688b912b9e4ec3913db038f4 -size 620084 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index e927b79466..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9551e3a8dad7fcd5033b69c69e25f2b0844b51da4ce6fe73c9fd70061490f8d0 -size 532221 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index c76875d68c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a7a222b68f86f740ed073a7145af7facc8de9947454096e340dba08202409162 -size 506105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..343c5c8761 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf7318cc054638cd294e5c66c7e58921bc2832f8fd862108dee360e9f9023ca0 +size 500899 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..a74bc8075e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14fec35af5f02b0e83c0fa75432ada6f733043ca2590084efa57d035dbb9ac0d +size 614925 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file 
mode 100644 index 0000000000..4b63464bf4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36055fc1a963c09ebc4164c4c2190f408f273183898d85e708c2c0b1c559c22f +size 527063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index fc8be6e089..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7c7e3894b4703fdbb1742a32099a1b3a5da604fd11696aa4e2413b7de4872a29 -size 647370 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 4d579763f4..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:22a861de1625ab33fad8fbcee284f4b703276da0de0d2054ab54e77ed7c231ca -size 550035 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 4581763968..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1dc70875c381547127bd67500448967face47e03767512fc656e9efd67cc373c -size 523919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..f03fe89a4e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:66c86d51a8b981965e5a82df05052b0f56ebdb2f9709393f7305237837e30b22 +size 518909 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..042e648572 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:912c750c31ac443dc14b74bb80c61314c3535d7edcfe47a4426d55245c6adb16 +size 640830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..2e7b8ffbba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccca9bf30a12e0e7fe0f9d4f075ecca308b22bc3c52b633b98a9db5b2c87bcc4 +size 545221 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 6dbbf8ffae..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a5581506c96efe8b3de780abd27fd16bf6ed4919b7299cbc66a45a2242cc2007 -size 489343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 924f219f91..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:080c437196f7e14f625dd9390268c7eeae4d10ac221c5ec95c16197616ad0a61 -size 408683 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 97e5a0a696..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2bcf16e3fc742b0a64edaf01e5ae83e01242c524bb090c6f58b96b1b6ba5bb2e -size 404619 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..07ef9af81e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18997fb7464d75fbddd5b974ebce6e947296216ec315098c67863fb18662c2f1 +size 419789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f44bc59d7d --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b71db6a57bdbe221e12aef06842bdc966d9ccedc9e068c49aa3c4b3c74c6d4c +size 352153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..24a68e306b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2b59ffcbf4c1a6b4825182d589835b97318ab99fc2b76082154a0dfbe5208d4 +size 348089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 8f62c204e2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:0ce822a9497b1879f2e9b64b19facee26b631b8c2c5ac9a56240dfff1e039f0d -size 510955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index a4c777e11d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4f85435556c41e775b48afd49b409e01e04de854a1c94206daa5669e243f9e1c -size 431971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index f32aae2779..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c618f664f3518c1905a2ae8818de844233fcea013c8d4ce745a70523c2c32fd2 -size 428747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..4bc15d718a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bce226338e7a2675b25a2a899020d1b796e7c0cd325acc92d6ea510ad932ab3b +size 441203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..7f6afaa068 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1c0555fa7853afe618da47f3c85a6a30969765c3dc15678f5a917e564617ccd +size 374849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..0528275bb6 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:445be6bb2d548ccbad24cb54f9d7bf6bd623330c1097f2b722ac74a5f72f2d3b +size 371625 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 2755c748f2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:53994ea4d8e793eb76e83fdb136742f2c0ad544b643a92c72c3f7bc75616190f -size 721020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 51d6302216..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:3ac9f582441c5b6fc61a2b11b96c989dea8514baf19aa20b83bc69b79d50e77e -size 551855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index dab2b984da..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:922cb705df2d15cbc735935036f5a4c013be2664b0658b3b80b8807bfd324cdb -size 746480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 2cd0196b00..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6ffcaaf98a0690e195f4db052de1beed5137cafec2b6bfa67e6237675d4f0622 -size 579683 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index feb752a8c9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb157f5d516bf61e88af32ccd8d02e04ff5882bf3dec504918c290b4041f9a04 -size 724868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 82cb1dd31d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dbf68ac9fd77c43fbdc4ba2b371c22f6c2b81687002804bc8b130194e085348e -size 598427 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 091fb35299..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c4d7ce3a66154e065984bfd99038d607c939e838812d879734a794675df91ee4 -size 751414 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index aa4c73bf6e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:893eae3bfcd2628e148cd1517d8409df2b609da270e0655a1119a4abe9aacf3e -size 628476 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 
70fc2cf4cb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c7002241c783cf7840a93c12d9d4e61638960a5027a35e7be15137fc41df9674 -size 724492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 6a70a56334..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a50e4d398403c8a46213e4be0e89dd9fce8cd48e530243f429bb408c896c852 -size 614769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 2f473889a8..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a9ce3c0958d0919ffdfdf77928ab3301f1518daf1ed9ea23e9e7f276af1b7854 -size 619362 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index bcc2eef210..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d087ad1b4a2c83063e69646d1bb0480555d263ef4a2249bc175e81e78dfa38fe -size 551779 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..35e8c484d3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:e7fda865674570a1fd46c6a1819c2142b2e56cd64d0495c268930cd85265d223 +size 730664 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..8fa8b26fc2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49e9f3da47698eb56eba8de6a47b68468a741a56c956208b6ac5d83e466d643b +size 629976 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 49b1769a62..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7f1413c9a0029ba988aa8bf992733c4be49428bbecd0901a47f169e3f2620e2b -size 756958 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index a4162074e7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:56ccca4d91cebfa0afa996a88267c85f2f1b0cc0a0c5ce35a771dba7f7a23a4b -size 640182 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index e197b783fa..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7c8c39520ff16a3a044611ce701410149e0cb45ea9e604c6b3fe939b0eaa3988 -size 644822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 00afda7d5b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:af1d97bb2fb4a1e29d796eacff6895823875a52d241fd11bc8bbfb2cd05e58c4 -size 577879 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..bc0df6954f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b14877b81b30776f826b7a2614bdb29833f56e6ac84e2fa29744316e83172d0d +size 763722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 
index 0000000000..aa48341042 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff57f1df32fd6a8e27822c6a61941a2bb058b93d961b031ad603f9ce9ba605ed +size 654152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index a975b31945..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:52d4626310ccf0ec623f7582013510af961fbc347225a1d0d276e9cca2c445b5 -size 741216 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index be549857a1..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d2871c8e692ef6cfe3dd31294e103e0bc1dee9bb404505e8ec010c6c7a2d9893 -size 642052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index f78656c79a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9b75aad55bebfea0fa1c7b61f6a8a984830a83c3b440d3b6652c8a8ed65ef59b -size 647482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 74a6a92c72..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:7f557b67ecb31849480e5230d62341870624d07a2a67f53e3917e09ae613e2b2 -size 563963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..52b1f95ca5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:117af8887a3e305a1b5dd133fa56c79e7c2a65a21cb99baf779f369539865582 +size 747388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f252d655f6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:862a03ca3ffdd48548a2ef0cc6d222900768b00a70d2b36579880c5fe6a3a383 +size 657996 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 0bbed40ec4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c261069ffdbc1375674e3ba344f06040124d48b42030d0c563fa6fe6f4946dd1 -size 775902 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index a83a9ba952..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2883088bd2bf442155ace498d91e03ff744733fe42d4764a3dad1b01011da1ec -size 668894 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index eafbfee741..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1fe2c29144c867c64f3d3bfb6535b64be7f5eca34f07eba20d97a08407aa8151 -size 673534 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 97f942c5f3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8b892131f9f896cd94d8893bc739140e5d5fd9c6ad1b56dcf9e848cdc4b19193 -size 590903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..0667d42897 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1261768b637531bea3520e8c2763b981607f330d937034c7b6fa33777ab79d8 +size 782814 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..cc19dc3a7d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:817ed44a8b2e61ebf0e0a9336b05dcfcec4e29e8173df95f2a9986cc982c8a0d +size 683062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index a144084f7a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:649bad79d3a3ef2a07e3076636c0e44dba5cfee9170a716e5127bdd50b9bfb54 -size 827994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 7ac299648f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5a212b236833536450d11a323d1ba2f6a0de1c148bf42efc558f0f89e3f40ffd -size 722218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp 
deleted file mode 100644 index 2a72627142..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:663f67df48a36fbb8d1805c060305b4c4afe420ef963ca3d9bfa255b24cb9fee -size 730856 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 4d5be3cf7a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cab2a0187322f206dd66cfe177ad8721e482f0b3830055a9c5f5772de4beb19e -size 626716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..bf5d600b67 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:333e7b6120bd40c68a252d28c06cc66877e3cb0a129b7876437e61207aea2d9f +size 833870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..3121fb0ccc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46335a677f7d15101f872b47ba27f23ef9df7e2a079991b61706ac3bd78a9ee9 +size 730960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 835e181a8b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp 
+++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:32d37324c92b18f3b6a11bb599ff65bbb79e35a09c5f2a1fc48fb456d113fc73 -size 848570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 7a09f092d9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1068b16d3275458179c521436ed37ea11b0efd84f436172b5660484f0ccc23d6 -size 724590 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 84d8a4d273..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:de61d46a04ad6d156430a7627079f0f0d755879a3166d6c45bc3044ff7ec5153 -size 753800 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 54c3640664..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5626bf2578f7e4903bf8862de2b1094d76c40e152df81df4582d46de8c009c2b -size 650252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..4d3e7f2908 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69ec7ee2099a8fd07fdf480c320e3d0d1f49722e759befd8f3a9e04134797c97 +size 854298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..b5b23663c8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:476a5e853bba875db6b46a78ac89ae65997add5e50370b18e32d46998a68042c +size 752868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index c5fe730ae3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6868a616c6c80a14e94927d0ff572dd498736f39bece8f4cec2d5aafaffe7524 -size 732182 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp deleted file 
mode 100644 index 945d67b47b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c3bbde2a7475474d1bfc94a38d714c539d2daf6a31871dfa92e157eeeb092fe2 -size 587483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 8c3b376874..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3b257c59b4d94424c66ed5b1aa0e86e94f1e3a1e64e29d50d5838bfaabaeea52 -size 591285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 53a321868e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:6473d848df0043076877dff72c9c06d221c48014d0a1e3c2c72b4f46f2ead706 -size 542497 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..3aa88e676d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dac191341e0c2ace0a7bb836dab5c086d58939174951d2bde5af74843dda86b +size 738946 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..cc029f6877 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4c653abf56693b3b4c24c11242b24f384b628d7820c9b5e1c3001067f8747ee +size 601207 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..c58b8c46d7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40f9c39d8f6608456ba13de259673d08f9eb678151d5ff8a716b30c63a4c2a00 +size 548423 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 09a381e0e8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4d809f43141a1c58fe709f332d0c12dc8b9aed15e6720f2a43ca586d1e756841 -size 756656 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 813489b153..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fc1b72ca7482f0a36b6e72d4ebffe6856dee179c73b83613d1c98f686b163649 -size 612893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 3070ecf70a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b9ca29575ab123ce1226133862ed446c88c3d77dbf6f4a9f58119395b8224152 -size 615955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 0346cd1633..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:916d4539cb36d0fb2889bed4baa98f85625f9ecc045241c2ec7710ea6a2b4a85 -size 569389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..fe4d02ba43 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bbea5715d6a2b792fd9e9c38653f387a53d71fe412ae446254cebdf4fc50ab8 +size 763222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..60f241ea64 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f9a8b178f6a88f0264ce1dc036afe230dffad101a027113c1e106d0ef195500 +size 626174 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..2a15a9413c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5782e26e596b32e13a8fad28e8fe58b8cd27a3cd1dbec10ad0fcd8c94bb766f +size 573933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 8397ebe352..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4f67506520434f4e097d45d449f41089cbb9582590295f8f2fba48e5d43996fa -size 830558 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index a2176f002f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8dd50b429b3cc85bec59d8536f640d14f7d5087c2faf7b544f34426daf92e213 -size 848274 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index a9156dd24f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dacd5573cf56150d2ceb3ee45b0a27923a1e2c8b2d9e39bf5a9091659bc46b74 -size 651720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index b6da8682e9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aa12e1966b64c05566bc207dea56dc6eaf0cc5922e404808db991eb93bc4ea94 -size 530803 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..63edc3f7f3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12ea1127bc6caba48d3c6bcc3a6737d5f62d9690fb9c6f3c3fda376d2d60b141 +size 655590 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..81d51ca5f3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf38ae8b4a37ba781712ecafc76f2d7ea12cf2858c9315be4c17b67300de6532 +size 529197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 264d13bf1f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2bfd72cd0cc811395e58d4c5577b2687e2815b5125303c553d79d7cc116c9372 -size 679004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 1156035e9f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6063c8c0ee972d29dbb6eeefc8ebb3a37a6d301a89507a168e816b5d19221e83 -size 549257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..7197910fda --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb1908f9653078b018436b7911aae221359ca4438fb305eed8ab4c94af935ec6 +size 683072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..97e9ec7d93 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58fe371417dce440f8502cb609083eabe611beb4584cb561fb89c55468794a94 +size 548145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index a4e119de23..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e8e6df206384194b7b46eb080105b9ed15368e269a0372e4af6c12c2e4ca0176 -size 365595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 10d0c9d340..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:73666c2256da1ecb702db857d33d203cab8ad3fb98fb64601a3d749a4fdb145c -size 380743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 3af5773abb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fd0fdb84b3a80209a0ebecb97e9068209f03669e00e6366936ca3b99ca20532d -size 407231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 07d93f2b0b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dab96d2635e9c953e5236c5409ec42ee5607bf9128652e329efd35de57431816 
-size 430521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index d1be987f79..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ae04c8071da9caa74dd33184768d5c93f99f8f4f12593388251e5b01d3aef35b -size 669676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index cbc6cd183c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2d3390af0e8c73d8e48646dc56030daa635e1b2588c5fdd2b691c457ca80bf6c -size 549055 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..4889eca703 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6509222d39a698e1411f9a829fede1cf78a0fd924964e1ab4889df5accf24abe +size 675224 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ba3e8d95e8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f65079590fd9eb664f40b98542c1612a7b1396852ea5f783c59108b44ff80a51 +size 553813 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 13ea3e1ed9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:37403d7c0eb5cef186d8e025a3510f20e840091443f7a00496d7e412df649ff6 -size 693016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 92b045b548..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3507ecec80cfb8a114451b8aa89c30c60de2983a1edef31b2e0ccfc4dd69a696 -size 567511 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..5812226dbc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f41a31bf0ea7db5fa45e592c73416998c04e6da558c0b1d7496973d775b9d28f +size 703694 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..89c67a92e3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3660a7406d501d1dc0ac7fee6d18c510433a1fac8764f27f8efee84d2a12fdf8 +size 571973 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 74155f14b8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:26fdadd279ab2caa94a8ad5e9e600f9a55948d6f3bf573137d92a06058cf4881 -size 379063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 68105a25ef..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1832e94682cb81a982bcb5bdcd77f49bf46b4ab73195e1d9e844adaa3feba501 -size 395051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 38f3b7047d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dd8ae599d6eac10c4e5098b509b72cb4e4fa369d024632099d22fb8ab71acfe2 -size 406195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 2c0e02d286..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dcb2b6fbaf3dc4b5b35b1988f090f91ea45a9cd0fe9b9cbbea282b4a6344584a -size 430323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 2dfe7ecd4b..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5df661a449db5d64dd5f6af8b5242ccf7c5e7342c6a8e4336f9b745252d2be53 -size 749548 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index c6ba536b9e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b1dff66a755d21396361d246a0ddfb3dad20ad0d758f078cb8add5ad45d21d7 -size 617481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..95b8365e30 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02653b7787a6d0c0e31d3b39a356228de6def37091fc08d61b2d95d61dd504a2 +size 756624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..0cc0d5622f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a5a1b4800e0147a8819d0712ce767f8a1ed8baaa807e04bec0b04b93cb55334 +size 621254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index d038bb4772..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:434e6cf958146538c89fcfaf1456edd63bcf4b984e6d22bb10fe928b34a00e34 -size 773626 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index e4ad82c374..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:390a8115df0918b8de3f6e8ce248872a4c736bfc32176b1b42f4f2c1d53030be -size 635936 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..89ddcd85dc --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04bc350db6e92b32532591e6a7f4817af6bebda9dfa136a004a7f26e49ca002d +size 787658 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..127faf4ba1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:913d6f95d37a62baa493ecf88929cca8c665b7b4052312d70afa0fcba9bf8443 +size 638622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index ccd04219bf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:323199b2b81e80b2e94aac52775e50797f788ae53911d2cd58c847a5bbc705c1 -size 381775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index c82a83eeab..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6d3d73eb9b3ca9d274cbe6035f832893f5ff7750855bc35f683e6dc12ba8ba3d -size 396975 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 7df91d20dc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6e87c061378b8fdb436d393499029fabbc6c40996a56dd7bb48d3d3961bfa70b -size 629760 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index e35cf34ff3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:76aafb70a84732f0ccb5d9fecc0204900479577c6a883e06571ca7cf264590bc -size 549789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 5c60876def..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb90e03b8aedcfab1ac4296eabedebbb5a488373532ff5043cbece0e615b597d -size 518085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..abcf50c9ad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc1f829768f394a0c3e3f55344095f585668a09352653769734a5962d558387f +size 512927 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..f2f72fb9b3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6d5ee131b5218cee8bc4bb8eaecd7127c8de9311b38e81bde5a51d8b19cd0c8 +size 636492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new 
file mode 100644 index 0000000000..66719b426c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ef5916b9e88b7fd7d58ba9844c804cc572d242365425c639160fd22a32b4c4 +size 549663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 0b0170df9d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0dc4c7adcb8df3507b2d2865fa74527b58553bdab67d679b6913398e798b0ec1 -size 656058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 4432003d42..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fde975d0fe7ab059003187c19e1b8094ae56d3d9d65e57cb2bd7d6ee394a77c0 -size 567701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index edea3269a3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:82aa8cc831e1269b4a5033f31b2e960c15ab57668c6af7f2c17d40eda8a4d863 -size 536689 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..9cae415dd0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:62cfcb8ebb10fdf8ede8d85465cc4667a1d1eec3157749ae9b78b8c4c2aad399 +size 530939 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..349c94aa3f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c04e8203a73f55ae51acd11cc2180b1a33f107933f6c84cf8a199caf28bbd55 +size 663184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..1411ab91ee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a38882e74f64e637d69c47d8a971625eb669bbbb0a2e7ed8521cda9bac5596c +size 566491 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index e24550c159..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:da83e0b39f9f9ceaea27ffec23a7557fd2a6621a2adbc1dc4eccf5bd07907dc7 -size 381713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index dc5eaacafc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d0b1774bd95182aa6a731e6798078a73677b8fc6ba33d9b025e081ff681fe529 -size 384727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index db92cda15b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:05a575f87c641bb26bf21e707e2eeba08fea3ec76e00dbd19ba123e1bb0cb88a -size 403921 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..2e270fc1b5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:932b1f3765f40aa1751e80e065ca8548f87f4a9d641d962b40b4ded3551bd35e +size 362729 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..c5b7be4b91 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c5cad59c87a548f7d7a0a688158c9c86793a09aa44d986eeac49450b2775f48 +size 347391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index be7b21c366..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:03202e4fbae5edd9360eef780d93fa841ecd20261face21183a901ffde595793 -size 407419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 509615c9da..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:141a042926d2ac42a3ab9b4ad135025532a1545321bcac9350562938ea5524e3 -size 410433 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 37d70d4fb3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:40bcb9addd9e9a9aae26fdfe3fd1d7961e559c334ed77dec7f02fd68a83c2b5a -size 427259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp new file mode 100644 index 0000000000..ee9a432fc6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45e4bec76b0d5e8e78e8756660a27a89e073b4af4843ff01b9867808c3e93004 +size 385575 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..0b45ae94f9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ef760f50f0ec4426fa7bc7ecaa6f07415f68647ad061b6970d472c0f4e3d4b7 +size 370137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 647583d6d3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b318dd86dd9c0ed9937fadae5a04f350a2d14404408ee9f45829921b10788644 -size 539643 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 17ae579c28..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2d7d3677a4f24779b57045d45aee9630bd590faf41d56495cd85941fe9b2beff -size 566583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index b99a8a0a1e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a58e3d02a90ae81158bf2d1c86ad02c258196e7493918131d1fbb674f7a4611a -size 542553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index c84d727004..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1b59df55bff46a883fe0d4e7263f3cf7ac5530521ee1f4de6267be7b60ad4705 -size 569443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 2c206db6a3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:79ad8ccff3f47abe6b10c2f3eabe24285e4c571d689f46ad6c573450648bca1d -size 587397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 2b86b98f05..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:118ca6081063bb9cd81d241fbc2695a56ac0a569b85794fa64cf6c9521e01f0f 
-size 608713 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index a53f400d9a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:00f27e49f222111c649e5b0a27631a9a003afedb39b8295ef239dda685b35804 -size 535887 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..6da407574c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5789ddc8bece759e54f7e53a06eedd34d24a59cc0e216d2ef8ce21a31ae1a95e +size 541813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 06d4d2ebe4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8369de7e02e7dc95016ff1ca0d8591813b7659c7b7e64676cc27d6d15a1ade6a -size 561989 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..2a7c966d75 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8425ed46a527766e2b20a59db17be8aa8511a7d03022685c489b3d1d1450118 +size 566533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 09b3cb2689..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ef65e2943df41260afd8386a0139d4d5157dc863e6c37b0712aea1aec6ce9162 -size 365397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 1bf8badf7c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b0606ef52d5cc169522b2e590d1e8fccf13c6e42cfef3d0b1e616a3c0ba99eb3 -size 380547 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x512_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x512_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 6b60fb75b7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x512_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:055aff6ad86abc19f0be98a1b456dae3e0f22720f4b32a9e6e6397b7201b85b6 
-size 407035 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x512u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x512u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 4233995531..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x16x512u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:157ed300e3b05c48bcbc7117058966b141b1361c8c2d6993a105efcd0fdc6672 -size 431113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index b49cb30c5c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:27754873994b8c574b0fac242c29246e926f0fbe6a246abf46849a40340f5201 -size 378667 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 459f102b30..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4c9c469d4e8d06f2e5808a780a0edd1093e513081193f7e0079417ec252773f1 -size 394655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x512_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x512_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 2c3353f034..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x512_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b573ce8a01a7daa855ac15496a91cf367643f066051e5d83e6085b6a8b7891d -size 406591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x512u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x512u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 037d338822..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x32x512u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d6defb2593eebf6f6836b00390fd58d72c691de6316aef902a73592cffcde897 -size 429929 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 9b2e21f0a0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bd40a5857ab80791717ba59955ff025a7a786bea319d1006fd7ffd03f869eb77 -size 380197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index a1a339cb2c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c3911391b24e870bbb10c21ac545300b032bba9f20f7de25a3de48f1502383c 
-size 396185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index d7eae74331..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61071c9df9d91c699bbc89120f1311f95444c69a36bd5816c98a6cc31d56b1ac -size 504519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..f3d6284577 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b381989464b296ed6bbeb5cc30b83d1e736e9ad521f6b37e3362cf6be4a880f1 +size 500101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index fcf27d4478..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6df36d9769a300dba73f5346aba238310994393596f894da80bc7af0d800409 -size 523911 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..dabdb1e60a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2916365c16d42aa950ee87cd0e11b8e8a4d0573714a6bf7e6412ca6cf058be25 +size 517323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index 97893b6162..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7c2a0218ed7875dd9d3020837e4307c64f28ac94e605e2d92765f5ff8bca4ba6 -size 403821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..0aecadc579 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa34a5fd253b0014ddc93425ed5ca30349cc1e88a8f0492dc679f96b065236d0 +size 347291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp deleted file mode 100644 index a0153d9e1f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2b955fc5be3e831b0b22921912f4973b51f9defb236508a1455d2406b64504ca -size 427161 diff 
--git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp new file mode 100644 index 0000000000..a95515865a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83d02d90f012c3c2aec1959f38a7607add5c4492be4c300522a122b6dd64a0e5 +size 370039 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 555a6c34e2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d2fbb9786415e98a360a2c856fb7cdcd87fc6f2682922d7bf42513ae2f4d0c8 -size 611915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp deleted file mode 100644 index 222e67316c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_silu_dynBatch_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5330d761614ff56b0dfd60a3b814347901be32a1eb5612e7dd97039ba4ae8d03 -size 646008 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp index 761fb475de..744294f34b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp @@ -113,8 +113,8 @@ void TrtllmGenGemmRunner::run(int32_t m, int32_t n, int32_t k, void const* a, fl // FIXME once we start using all-reduce in the epilogue of the gemm this can be moved elsewhere gemm.runInitBeforeWorldSync(config, gemmData, static_cast(stream)); - auto const err = gemm.run( - config, workspace, gemmData, static_cast(stream), multiProcessorCount, globalTrtllmGenGemmModuleCache); + auto const err = gemm.run(config, workspace, gemmData, static_cast(stream), multiProcessorCount, + /*usePdl=*/true, globalTrtllmGenGemmModuleCache); TLLM_CHECK_WITH_INFO(err == 0, "Error occurred when running GEMM!"); } diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/Enums.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/Enums.h index 0ff3334a3e..adae51a36d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/Enums.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/Enums.h @@ -39,6 +39,31 @@ 
enum class AllReduceAlgo : uint32_t //////////////////////////////////////////////////////////////////////////////////////////////////// +enum class MatrixLayout +{ + // K-major layout (default). [Mn, K] + MajorK = 0, + // M-major for A and N-major for B. [K, Mn] + MajorMn, + // Layout is blocked along the K dimension as seen in the diagram below. [K / blockK, Mn, blockK] + // where blockK is fixed at 128B + // + // ├────────────── K ──────────────┤ + // ┬ ┬ ├──── K block ───┤ + // │ │ │ 0 1 2 3 ║ 32 33 34 35 │ + // │ CTA0 │ 4 5 6 7 ║ 36 37 38 39 │ + // │ │ │ 8 9 10 11 ║ 40 41 42 43 │ + // │ ┴ │ 12 13 14 15 ║ 44 45 46 47 │ + // M ┬ ├────────────────║────────────────┤ + // │ │ │ 16 17 18 19 ║ 48 49 50 51 │ + // │ CTA1 │ 20 21 22 23 ║ 52 53 54 55 │ + // │ │ │ 24 25 26 27 ║ 56 57 58 59 │ + // ┴ ┴ │ 28 29 30 31 ║ 60 61 62 63 │ + BlockMajorK +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + enum class SplitK : uint32_t { // No split-k is needed. I.e. mNumSlicesForSplitK == 1. @@ -54,6 +79,20 @@ enum class SplitK : uint32_t //////////////////////////////////////////////////////////////////////////////////////////////////// +enum class BiasType : uint32_t +{ + // No bias. + None = 0, + // One bias value per N of the output tensor. + M = 1, + // One bias value per row M of the output tensor. + N = 2, + // One bias value for each element of the output tensor. + Mn = 3, +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + enum class TileScheduler { // Static scheduler (Non-persistent). @@ -80,6 +119,23 @@ SPLIT_K_FUNCTION(Dsmem) //////////////////////////////////////////////////////////////////////////////////////////////////// +// Helper functions to check the Bias type. 
+ +#define BIAS_TYPE_FUNCTION(Mode) \ + inline bool isBiasType##Mode(BiasType type) \ + { \ + return (type == BiasType::Mode); \ + } + +BIAS_TYPE_FUNCTION(None) +BIAS_TYPE_FUNCTION(N) +BIAS_TYPE_FUNCTION(M) +BIAS_TYPE_FUNCTION(Mn) + +#undef BIAS_TYPE_FUNCTION + +//////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace gemm } // namespace gemm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h index 459d831e0b..0e7b7c13cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmInterface.h @@ -63,8 +63,10 @@ struct GemmData { // The matrix A. The data type is controlled by options.mDtypeA. // - // When transposeMatrixA is false, the shape is [M, K]. - // Otherwise, the shape is [K, M]. + // When layoutA is MatrixLayout::MajorK, the shape is [M, K]. + // When LayoutA is MatrixLayout::MajorMn, the shape is [K, M]. + // When LayoutA is MatrixLayout::BlockMajorK, the shape is [K / blockK, M, blockK] where blockK + // is 128B. // The rightmost dimension is contiguous in memory. void const* mPtrA{nullptr}; @@ -100,8 +102,10 @@ struct GemmData // The matrix B. The data type is controlled by options.mDtypeB. // - // When transposeMatrixB is true, the shape is [N, K]. - // Otherwise, the shape is [K, N]. + // When layoutB is MatrixLayout::MajorK, the shape is [N, K]. + // When layoutB is MatrixLayout::MajorMn, the shape is [K, N]. + // When layoutB is MatrixLayout::BlockMajorK, the shape is [K / blockK, N, blockK] where blockK + // is 128B. // The rightmost dimension is contiguous in memory. void const* mPtrB{nullptr}; @@ -142,6 +146,21 @@ struct GemmData // The shape is [N] void const* mPtrPerTokenSfB{nullptr}; + // The bias applied after the GEMM. 
+ // The bias is applied before applying the global scaling factor. I.e. + // C' = (A * B + bias') * scaleC + // scaleC = dequantA * dequantB * quantC + // Thus, the bias' = bias / (dequantA * dequantB), where the bias is the original bias. + // + // if BiasType is N, the shape is [N]. + // The bias is broadcasted along the M dimension. + // + // if BiasType is M, the shape is [M]. + // The bias is broadcasted along the N dimension. + // + // The dtype is float32. + void const* mPtrBias{nullptr}; + // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. // TensorRT-LLM API requires a scaling factor on the device. // Shape is [1]. @@ -230,7 +249,7 @@ public: // Launch the cubin from the provided config. It calls all necessary memsets for internal buffers. // Provided config must be validated with isValidConfig before the call. int32_t run(GemmConfig const& config, void* workspace, GemmData const& options, void* cudaStream, - int32_t multiProcessorCount, + int32_t multiProcessorCount, bool usePdl = true, std::optional> moduleCache = std::nullopt) const; // Initializes the buffers before the world sync. Must be called before run. @@ -388,8 +407,11 @@ bool GemmInterface::isValidConfig(GemmConfig const& config, GemmData const& data //////////////////////////////////////////////////////////////////////////////////////////////////// int32_t GemmInterface::run(GemmConfig const& config, void* workspace, GemmData const& data, void* cudaStream, - int32_t multiProcessorCount, std::optional> moduleCache) const + int32_t multiProcessorCount, bool usePdl, std::optional> moduleCache) const { + // Might be used. + (void) usePdl; + (void) moduleCache; // Get options from config and data. auto options = getOptionsFromConfigAndData(config, data); @@ -417,15 +439,14 @@ int32_t GemmInterface::run(GemmConfig const& config, void* workspace, GemmData c int numTilesN = gemm::divUp(options.mN, options.mTileN); // Create kernel params. 
- auto kernelParams = gemm::KernelParams::setKernelParams(options, data.mInputBuffers.mPtrA, + auto kernelParams = gemm::KernelParamsSetup::setKernelParams(options, data.mInputBuffers.mPtrA, data.mInputBuffers.mPtrSfA, data.mInputBuffers.mPtrPerTokenSfA, data.mInputBuffers.mPtrB, - data.mInputBuffers.mPtrSfB, data.mInputBuffers.mPtrPerTokenSfB, data.mOutputBuffers.mPtrC, - data.mOutputBuffers.mPtrSfC, data.mOutputBuffers.mPtrMultiMemC, (float*) data.mInputBuffers.mPtrScaleC, - dSplitKSlices, data.mAllReduceBuffers.mPtrTileBars, data.mAllReduceBuffers.mPtrMultiMemTileBars, - data.mAllReduceBuffers.mPtrCompletionBars, data.mAllReduceBuffers.mPtrMultiMemCompletionBars, - dPtrSplitKCompletionBars, + data.mInputBuffers.mPtrSfB, data.mInputBuffers.mPtrPerTokenSfB, data.mInputBuffers.mPtrBias, + data.mOutputBuffers.mPtrC, data.mOutputBuffers.mPtrSfC, data.mOutputBuffers.mPtrMultiMemC, + (float*) data.mInputBuffers.mPtrScaleC, dSplitKSlices, data.mAllReduceBuffers.mPtrTileBars, + data.mAllReduceBuffers.mPtrMultiMemTileBars, data.mAllReduceBuffers.mPtrCompletionBars, + data.mAllReduceBuffers.mPtrMultiMemCompletionBars, dPtrSplitKCompletionBars, /* dPtrNumNonExitingCtas */ nullptr, data.mProblemDimensions.mRank, data.mProblemDimensions.mWorldSize); - // The size of the grid. 
std::vector grid{numTilesM, numTilesN, options.mNumSlicesForSplitK}; @@ -443,26 +464,26 @@ int32_t GemmInterface::run(GemmConfig const& config, void* workspace, GemmData c #ifdef TLLM_GEN_EXPORT_INTERFACE CUmodule cuModule; CUfunction cuFunction; + if (moduleCache.has_value()) { ModuleCache& moduleCacheRef = moduleCache.value().get(); - // Modules are associated with a specific context so include the ctxId in the key + // Modules are associated with a specific context, so the context is included in the key CUcontext ctx; unsigned long long ctxId; cuCtxGetCurrent(&ctx); cuCtxGetId(ctx, &ctxId); - // Reinterpret the ctxId as a string to avoid needing a custom hash or converting it to a string in decimal - // representation. + // Reinterpret the ctxId as a string to avoid needing a custom hash or converting it to a + // string in decimal representation. std::string const ctxName = std::string(reinterpret_cast(&ctxId), sizeof(unsigned long long) / sizeof(char)); std::string const funcName = std::string(config.mFunctionName); - // As the ctxName is a fixed number of bytes, the two strings can just be appended without risk of a collision auto const moduleKey = ctxName + funcName; auto module = moduleCacheRef.find(moduleKey); - // Check if module exists in cache. Otherwise, load it + // Use cache if module is found, otherwise load and insert into cache if (module != moduleCacheRef.end()) { cuFunction = std::get<1>(module->second); @@ -492,17 +513,18 @@ int32_t GemmInterface::run(GemmConfig const& config, void* workspace, GemmData c // Run the kernel. 
auto result = trtllm::gen::launchKernel((void*) &kernelParams, cudaStream, config.mSharedMemSize, cuFunction, block3, grid3, cluster3, - config.mOptions.mGridWaitForPrimaryEarlyExit | config.mOptions.mGridWaitForPrimaryA - | config.mOptions.mGridWaitForPrimaryB); - if (result != CUDA_SUCCESS) - { - return -1; - } + usePdl + && (config.mOptions.mGridWaitForPrimaryEarlyExit | config.mOptions.mGridWaitForPrimaryA + | config.mOptions.mGridWaitForPrimaryB)); // If a module cache has not been given, unload the module to avoid leaking if (!moduleCache.has_value()) { cuModuleUnload(cuModule); } + if (result != CUDA_SUCCESS) + { + return -1; + } #else config.mCudaRunner->run((void*) &kernelParams, (void*) cudaStream, grid); #endif diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmOptions.h index 8ab241fc6c..1e3abbfeef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/GemmOptions.h @@ -91,20 +91,23 @@ struct GemmOptions GemmOptions() = default; - GemmOptions(AllReduceAlgo allReduceAlgo, int clusterDimX, int clusterDimY, int clusterDimZ, tg::Dtype dtypeAcc, - tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, bool enablesEarlyExit, bool enablesDelayedEarlyExit, - bool enablesGlobalPtxKnobs, int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, - bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, - bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, - KernelTraits kernelTraits, int m, int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n, - int numSlicesForSplitK, int numSlicesForSliceK, int numStages, int numStagesMma, int numStagesMmaWithinWorkTile, - int numStagesMmaAcrossWorkTile, int 
numStagesWorkId, bool outputDebugTensors, bool useShuffledMatrixA, - bool sliceK, SplitK splitK, bool transposeMatrixA, bool transposeMatrixB, bool transposeMmaOutput, int tileM, + GemmOptions(AllReduceAlgo allReduceAlgo, BiasType biasType, int blockK, int clusterDimX, int clusterDimY, + int clusterDimZ, tg::Dtype dtypeAcc, tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeMmaA, + tg::Dtype dtypeMmaB, bool enablesEarlyExit, bool enablesDelayedEarlyExit, bool enablesGlobalPtxKnobs, + int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, bool gridTriggerSecondaryA, + bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, + bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, KernelTraits kernelTraits, + MatrixLayout layoutA, MatrixLayout layoutB, int m, int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, + bool mockAllReduce, int n, int numSlicesForSplitK, int numSlicesForSliceK, int numStages, int numStagesMma, + int numStagesMmaWithinWorkTile, int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, + bool patchF2fp, bool useShuffledMatrixA, bool sliceK, SplitK splitK, bool transposeMmaOutput, int tileM, int tileN, int tileK, bool useUnrollLoop2xForMma, bool useCustomMmaSchedule, bool useHoistTryWaitForCustomMmaSchedule, bool useDeepSeekFp8, bool usePerTokenSfA, bool usePerTokenSfB, bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, - tg::SfLayout sfLayoutC, TileScheduler tileScheduler) + tg::SfLayout sfLayoutC, int sfReshapeFactor, TileScheduler tileScheduler) : mAllReduceAlgo{allReduceAlgo} + , mBiasType{biasType} + , mBlockK(blockK) , mClusterDimX{clusterDimX} , mClusterDimY{clusterDimY} , mClusterDimZ{clusterDimZ} @@ -112,6 +115,8 @@ struct GemmOptions , mDtypeA{dtypeA} , mDtypeB{dtypeB} , mDtypeC{dtypeC} + , mDtypeMmaA{dtypeMmaA} + , mDtypeMmaB{dtypeMmaB} , 
mEnablesEarlyExit{enablesEarlyExit} , mEnablesDelayedEarlyExit{enablesDelayedEarlyExit} , mEnablesGlobalPtxKnobs{enablesGlobalPtxKnobs} @@ -128,6 +133,8 @@ struct GemmOptions , mHoistMmaTaskTryWaits{hoistMmaTaskTryWaits} , mK{k} , mKernelTraits{kernelTraits} + , mLayoutA{layoutA} + , mLayoutB{layoutB} , mM{m} , mMmaK{mmaK} , mMmaKind{mmaKind} @@ -143,11 +150,10 @@ struct GemmOptions , mNumStagesMmaAcrossWorkTile{numStagesMmaAcrossWorkTile} , mNumStagesWorkId{numStagesWorkId} , mOutputDebugTensors{outputDebugTensors} + , mPatchF2fp{patchF2fp} , mUseShuffledMatrixA{useShuffledMatrixA} , mSliceK{sliceK} , mSplitK{splitK} - , mTransposeMatrixA{transposeMatrixA} - , mTransposeMatrixB{transposeMatrixB} , mTransposeMmaOutput{transposeMmaOutput} , mTileM{tileM} , mTileN{tileN} @@ -164,13 +170,17 @@ struct GemmOptions , mSfLayoutA{sfLayoutA} , mSfLayoutB{sfLayoutB} , mSfLayoutC{sfLayoutC} + , mSfReshapeFactor{sfReshapeFactor} , mTileScheduler{tileScheduler} { } // The all-reduce algorithm. AllReduceAlgo mAllReduceAlgo{AllReduceAlgo::None}; - + // The type of bias. + BiasType mBiasType{BiasType::None}; + // Block size in the K dimension + int mBlockK{-1}; // Cluster size in X dim. int mClusterDimX{1}; // Cluster size in Y dim. @@ -185,6 +195,10 @@ struct GemmOptions tg::Dtype mDtypeB{tg::Dtype::Void}; // Data type of the outputs. tg::Dtype mDtypeC{tg::Dtype::Void}; + // Data type of the A matrix for the MMA, if different from the input type. + tg::Dtype mDtypeMmaA{tg::Dtype::Void}; + // Data type of the B matrix for the MMA, if different from the input type. + tg::Dtype mDtypeMmaB{tg::Dtype::Void}; // Whether to enable early exit. bool mEnablesEarlyExit{false}; // Whether to enable delayed early exit to overlap @@ -225,6 +239,10 @@ struct GemmOptions int mK{16 * 16}; // Traits of the kernel. 
KernelTraits mKernelTraits{}; + // Layout of A matrix + MatrixLayout mLayoutA{MatrixLayout::MajorK}; + // Layout of B matrix + MatrixLayout mLayoutB{MatrixLayout::MajorK}; // The M dimension of GEMM. int mM{128 * 2}; // Size of the MMA instruction in the K dimension. @@ -259,16 +277,14 @@ struct GemmOptions int mNumStagesWorkId{3}; // Whether to output debug tensors. bool mOutputDebugTensors{false}; + // Patch float conversions. + bool mPatchF2fp{false}; // Reorder rows/cols in the A matrix for the better memory accesses in the M-major epilogue. bool mUseShuffledMatrixA{false}; // Slice-K implementation to use TileM dimension for TileK. bool mSliceK{false}; // The location of the exchange for split-K (it's None when split-K is disabled). SplitK mSplitK{SplitK::None}; - // Is A matrix in a transposed layout? M major if true, K major otherwise - bool mTransposeMatrixA{false}; - // Is B matrix in a transposed layout? K major if true, N major otherwise - bool mTransposeMatrixB{true}; // Save output of MMA in M-major format. bool mTransposeMmaOutput{false}; // M tile dimension of GEMM. @@ -303,6 +319,12 @@ struct GemmOptions tg::SfLayout mSfLayoutB{tg::SfLayout::R128c4}; // Scale factors layout for C. tg::SfLayout mSfLayoutC{tg::SfLayout::R128c4}; + // Number of "repeats", i.e. reshaping factor, to fold hidden dimension into SfBlock dimension. + // As result, the hidden dimension of the SF tensor must be a multiple of NumRepeats * + // numEltsPerSf * 4. This reduces the problem shape space that the kernel is able to run. + // But it reduces the number of L2 requests under the hood and potentially improves perf. + // Applies to layout 8x4 only. + int mSfReshapeFactor{1}; // Tile scheduler type. 
TileScheduler mTileScheduler{TileScheduler::Static}; }; @@ -332,6 +354,7 @@ struct GemmConfig uint32_t const mSharedMemSize{0}; char const* mFunctionName{nullptr}; uint32_t const mNumThreadsPerCTA{0}; + char const* mHash{nullptr}; #else trtllm::gen::CudaRunner* mCudaRunner{nullptr}; #endif @@ -373,6 +396,10 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mAllReduceAlgo=" << "gemm::AllReduceAlgo(" << static_cast(options.mAllReduceAlgo) << ")" << "," << std::endl; + ss << "mBiasType=" + << "gemm::BiasType(" << static_cast(options.mBiasType) << ")" + << "," << std::endl; + ss << "mBlockK=" << options.mBlockK << "," << std::endl; ss << "mClusterDimX=" << options.mClusterDimX << "," << std::endl; ss << "mClusterDimY=" << options.mClusterDimY << "," << std::endl; ss << "mClusterDimZ=" << options.mClusterDimZ << "," << std::endl; @@ -388,6 +415,12 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mDtypeC=" << "trtllm::gen::Dtype(" << static_cast(options.mDtypeC) << ")" << "," << std::endl; + ss << "mDtypeMmaA=" + << "trtllm::gen::Dtype(" << static_cast(options.mDtypeMmaA) << ")" + << "," << std::endl; + ss << "mDtypeMmaB=" + << "trtllm::gen::Dtype(" << static_cast(options.mDtypeMmaB) << ")" + << "," << std::endl; ss << "mEnablesEarlyExit=" << options.mEnablesEarlyExit << "," << std::endl; ss << "mEnablesDelayedEarlyExit=" << options.mEnablesDelayedEarlyExit << "," << std::endl; ss << "mEnablesGlobalPtxKnobs=" << options.mEnablesGlobalPtxKnobs << "," << std::endl; @@ -405,6 +438,10 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mK=" << options.mK << "," << std::endl; ss << "mKernelTraits={}" << "," << std::endl; + ss << "mLayoutA=gemm::MatrixLayout(" << static_cast(options.mLayoutA) << ")" + << "," << std::endl; + ss << "mLayoutB=gemm::MatrixLayout(" << static_cast(options.mLayoutB) << ")" + << "," << std::endl; ss << "mM=" << options.mM << "," << std::endl; ss << "mMmaK=" << options.mMmaK << "," << std::endl; 
ss << "mMmaKind=" @@ -422,13 +459,12 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mNumStagesMmaAcrossWorkTile=" << options.mNumStagesMmaAcrossWorkTile << "," << std::endl; ss << "mNumStagesWorkId=" << options.mNumStagesWorkId << "," << std::endl; ss << "mOutputDebugTensors=" << options.mOutputDebugTensors << "," << std::endl; + ss << "mPatchF2fp=" << options.mPatchF2fp << "," << std::endl; ss << "mUseShuffledMatrixA=" << options.mUseShuffledMatrixA << "," << std::endl; ss << "mSliceK=" << options.mSliceK << "," << std::endl; ss << "mSplitK=" << "gemm::SplitK(" << static_cast(options.mSplitK) << ")" << "," << std::endl; - ss << "mTransposeMatrixA=" << options.mTransposeMatrixA << "," << std::endl; - ss << "mTransposeMatrixB=" << options.mTransposeMatrixB << "," << std::endl; ss << "mTransposeMmaOutput=" << options.mTransposeMmaOutput << "," << std::endl; ss << "mTileM=" << options.mTileM << "," << std::endl; ss << "mTileN=" << options.mTileN << "," << std::endl; @@ -451,6 +487,7 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mSfLayoutC=" << "trtllm::gen::SfLayout(" << static_cast(options.mSfLayoutC) << ")" << "," << std::endl; + ss << "mSfReshapeFactor=" << options.mSfReshapeFactor << "," << std::endl; ss << "mTileScheduler=" << "gemm::TileScheduler(" << static_cast(options.mTileScheduler) << ")" << std::endl; return ss.str(); @@ -490,6 +527,7 @@ inline int32_t getShuffleBlockSize(int epilogueTileM) inline bool checkAndUpdateGemmOptions( GemmOptions& options, bool isBlackwell, int /* tpGrpSize */, bool updateOptions = true) { + if (options.mDtypeB == tg::Dtype::Void) { if (updateOptions) @@ -502,39 +540,98 @@ inline bool checkAndUpdateGemmOptions( } } + // If not specified, used the input dtypes as MMA dtypes (no cast required). 
+ if (options.mDtypeMmaA == tg::Dtype::Void) + { + if (updateOptions) + { + options.mDtypeMmaA = options.mDtypeA; + } + else + { + return false; + } + } + if (options.mDtypeMmaB == tg::Dtype::Void) + { + if (updateOptions) + { + options.mDtypeMmaB = options.mDtypeB; + } + else + { + return false; + } + } + + // Check that the A cast is supported. + // Currently, we only support {MxFp4, NvFp4} -> Bf16. + TLLM_CHECK_ERROR((options.mDtypeA == options.mDtypeMmaA) + || ((options.mDtypeA == tg::Dtype::MxE2m1 || options.mDtypeA == tg::Dtype::E2m1) + && options.mDtypeMmaA == tg::Dtype::Bfloat16) + || (options.mDtypeA == tg::Dtype::E2m1 && options.mDtypeMmaA == tg::Dtype::E4m3), + "Unsupported cast for A: ", tg::dtypeToString(options.mDtypeA), " -> ", tg::dtypeToString(options.mDtypeMmaA)); + + // Check that the B cast is supported. + // Currently, we only support Fp8 -> MxFp8. + // TODO: add same support for A (no transpose) + TLLM_CHECK_ERROR((options.mDtypeB == options.mDtypeMmaB) + || (options.mDtypeB == tg::Dtype::E4m3 && options.mDtypeMmaB == tg::Dtype::MxE4m3), + "Unsupported cast for B: ", tg::dtypeToString(options.mDtypeB), " -> ", tg::dtypeToString(options.mDtypeMmaB)); + + if (options.mDtypeA != options.mDtypeMmaA) + { + TLLM_CHECK_ERROR(options.mTileM == 128, "TileM must be 128 when casting the input matrix A before the MMA."); + } + + if (options.mPatchF2fp) + { + TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::MxE2m1 && options.mDtypeMmaA == tg::Dtype::Bfloat16, + "PatchF2fp is only supported for MxFp4 to Bf16 casts."); + } + // FIXME: We do not support different dtypes for A and B when not on Blackwell. 
if (!isBlackwell) { - TLLM_CHECK_ERROR(options.mDtypeA == options.mDtypeB, "For non-Blackwell, A and B must have the same dtype."); + TLLM_CHECK_ERROR( + options.mDtypeMmaA == options.mDtypeMmaB, "For non-Blackwell, A and B must have the same dtype."); } // Check that the different dtypes for A and B are supported by the tensor core // kind::f8f6f4 - if (options.mDtypeA == tg::Dtype::E4m3 || options.mDtypeA == tg::Dtype::E2m1) + if (options.mDtypeMmaA == tg::Dtype::E4m3 || options.mDtypeMmaA == tg::Dtype::E2m1) { - TLLM_CHECK_ERROR(options.mDtypeB == tg::Dtype::E4m3 || options.mDtypeB == tg::Dtype::E2m1, - "For E4m3/E2m1 A, B must also be E4m3/E2m1."); + TLLM_CHECK_ERROR(options.mDtypeMmaB == tg::Dtype::E4m3 || options.mDtypeMmaB == tg::Dtype::E2m1, + "For dtypeMmaA = E4m3/E2m1 A, dtypeMmaB must also be E4m3/E2m1."); } // kind::mxf8f6f4 - if (options.mDtypeA == tg::Dtype::MxE4m3 || options.mDtypeA == tg::Dtype::MxE2m1) + if (options.mDtypeMmaA == tg::Dtype::MxE4m3 || options.mDtypeMmaA == tg::Dtype::MxE2m1) { - TLLM_CHECK_ERROR(options.mDtypeB == tg::Dtype::MxE4m3 || options.mDtypeB == tg::Dtype::MxE2m1, - "For dtypeA = MxE4m3 or MxE2m1, dtypeB must also be MxE4m3 or MxE2m1."); + TLLM_CHECK_ERROR(options.mDtypeMmaB == tg::Dtype::MxE4m3 || options.mDtypeMmaB == tg::Dtype::MxE2m1, + "For dtypeMmaA = MxE4m3 or MxE2m1, dtypeMmaB must also be MxE4m3 or MxE2m1."); } - if (options.mDtypeB == tg::Dtype::MxE4m3 || options.mDtypeB == tg::Dtype::MxE2m1) + if (options.mDtypeMmaB == tg::Dtype::MxE4m3 || options.mDtypeMmaB == tg::Dtype::MxE2m1) { - TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::MxE4m3 || options.mDtypeA == tg::Dtype::MxE2m1, - "For dtypeB = MxE4m3 or MxE2m1, dtypeA must also be MxE4m3 or MxE2m1."); + TLLM_CHECK_ERROR(options.mDtypeMmaA == tg::Dtype::MxE4m3 || options.mDtypeMmaA == tg::Dtype::MxE2m1, + "For dtypeMmaB = MxE4m3 or MxE2m1, dtypeMmaA must also be MxE4m3 or MxE2m1."); } // kind::f16 - if (options.mDtypeA == tg::Dtype::Fp16 || options.mDtypeA == 
tg::Dtype::Bfloat16) + if (options.mDtypeMmaA == tg::Dtype::Fp16 || options.mDtypeMmaA == tg::Dtype::Bfloat16) { - TLLM_CHECK_ERROR(options.mDtypeB == options.mDtypeA, "For Fp16/Bfloat16 A, B must be the same type as A."); + TLLM_CHECK_ERROR(options.mDtypeMmaB == options.mDtypeMmaA, + "For dtypeMmaA = Fp16/Bfloat16, dtypeMmaB must be the same as dtypeMmaA."); } - // When different dtype are used for A and B, we must use different tile to do the loading. + // When one of the inputs needs to be cast, we must use two load warps. + if ((options.mDtypeMmaA != options.mDtypeA || options.mDtypeMmaB != options.mDtypeB) + && !options.mUseTwoTmaLoadWarps) + { + TLLM_LOG_WARNING("Two TMA load warps must be enabled if any of the inputs needs to be cast."); + } + + // When different dtypes are used for A and B, we must use different tiles to do the loading. // It is not strictly required, but current implementation of SmemAb requires that. if (options.mDtypeA != options.mDtypeB) { @@ -547,7 +644,7 @@ inline bool checkAndUpdateGemmOptions( { if (updateOptions) { - options.mMmaKind = dtypeGetMmaKind(options.mDtypeA, options.mDtypeB); + options.mMmaKind = dtypeGetMmaKind(options.mDtypeMmaA, options.mDtypeMmaB); } else { @@ -555,11 +652,6 @@ inline bool checkAndUpdateGemmOptions( } } - if (options.mMmaKind == tg::MmaKind::Fp16) - { - TLLM_CHECK_ERROR(options.mDtypeA == options.mDtypeB, "For Fp16 MMA, A and B must have the same dtype."); - } - if ((options.mMmaKind == tg::MmaKind::Fp8Fp6Fp4 || options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) && options.mMmaK != 32) { @@ -626,9 +718,6 @@ inline bool checkAndUpdateGemmOptions( { TLLM_CHECK_ERROR(isBlackwell, "Block scaling is only supported on Blackwell"); - TLLM_CHECK_ERROR(options.mSfLayoutB == tg::SfLayout::R128c4 || options.mSfLayoutB == tg::SfLayout::R8c4, - "Only the 128x4 and 8x4 SF layouts are supported for B, got ", tg::sfLayoutToString(options.mSfLayoutB)); - int const mmaK = (options.mMmaKind == tg::MmaKind::MxFp4NvFp4) ? 
64 : 32; if (options.mMmaK != mmaK) { @@ -646,21 +735,56 @@ inline bool checkAndUpdateGemmOptions( } } + // The MMA N may only be smaller than 64 if it is equal to the tile N. + TLLM_CHECK_ERROR(options.mMmaN >= 64 || options.mMmaN == options.mTileN, "MmaN (", options.mMmaN, + ") must be >= 64 or equal to TileN (", options.mTileN, ")"); + } + if (tg::dtypeIsBlockFmt(options.mDtypeA)) + { + int numEltsPerSfA = tg::dtypeNumEltsPerSf(options.mDtypeA); + TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfA) == 0, "TileK (", options.mTileK, + ") must be a multiple of ", (4 * numEltsPerSfA), " for typeA ", gemm::toString(options.mDtypeA)); + auto const numEltsPerSfAInK = options.mK / numEltsPerSfA; + TLLM_CHECK_ERROR(numEltsPerSfAInK % 4 == 0, "K dimension of scaling factors for A (", numEltsPerSfAInK, + ") must be a multiple of 4"); + } + if (tg::dtypeIsBlockFmt(options.mDtypeB)) + { + TLLM_CHECK_ERROR(options.mSfLayoutB == tg::SfLayout::R128c4 || options.mSfLayoutB == tg::SfLayout::R8c4 + || options.mSfLayoutB == tg::SfLayout::Linear, + "Only the 128x4 and 8x4 SF layouts are supported for B, got ", tg::sfLayoutToString(options.mSfLayoutB)); + // TileN must be a multiple of the number of rows per SF tile. int const numSfTileRowsB = options.mSfLayoutB == tg::SfLayout::R128c4 ? 128 : 8; TLLM_CHECK_ERROR(options.mTileN % numSfTileRowsB == 0, "TileN (", options.mTileN, ") must be a multiple of ", numSfTileRowsB, " for B SF layout ", tg::sfLayoutToString(options.mSfLayoutB)); - // The MMA N may only be smaller than 64 if it is equal to the tile N. 
- TLLM_CHECK_ERROR(options.mMmaN >= 64 || options.mMmaN == options.mTileN, "MmaN (", options.mMmaN, - ") must be >= 64 or equal to TileN (", options.mTileN, ")"); - int numEltsPerSfA = tg::dtypeNumEltsPerSf(options.mDtypeA); int numEltsPerSfB = tg::dtypeNumEltsPerSf(options.mDtypeB); - TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfA) == 0, "TileK (", options.mTileK, - ") must be a multiple of ", (4 * numEltsPerSfA), " for typeA ", gemm::toString(options.mDtypeA)); TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfB) == 0, "TileK (", options.mTileK, ") must be a multiple of ", (4 * numEltsPerSfB), " for typeB ", gemm::toString(options.mDtypeB)); + auto const numEltsPerSfBInK = options.mK / numEltsPerSfB; + TLLM_CHECK_ERROR(numEltsPerSfBInK % 4 == 0, "K dimension of scaling factors for B (", numEltsPerSfBInK, + ") must be a multiple of 4"); } + + int32_t padMultiplierA = 1; + int32_t padMultiplierB = 1; + if (options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) + { + if (options.mDtypeA == tg::Dtype::MxE2m1) + { + padMultiplierA = 2; + } + if (options.mDtypeB == tg::Dtype::MxE2m1) + { + padMultiplierB = 2; + } + } + TLLM_CHECK_ERROR((padMultiplierA * tg::dtypeGetNumBits(options.mDtypeA) * options.mK / 8) % 16 == 0, + "K dimension of A must be aligned to 16 bytes."); + TLLM_CHECK_ERROR((padMultiplierB * tg::dtypeGetNumBits(options.mDtypeB) * options.mK / 8) % 16 == 0, + "K dimension of B must be aligned to 16 bytes."); + if (options.mDtypeC == tg::Dtype::E2m1 || options.mDtypeC == tg::Dtype::MxE4m3) { TLLM_CHECK_ERROR(isBlackwell, "Block scaling is only supported on Blackwell"); @@ -668,8 +792,10 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mSfLayoutC == tg::SfLayout::R128c4 || options.mSfLayoutC == tg::SfLayout::R8c4, "Only the 128x4 and 8x4 SF layouts are supported for C."); int const numSfTileRowsC = options.mSfLayoutC == tg::SfLayout::R128c4 ? 
128 : 8; - TLLM_CHECK_ERROR(options.mTileN % numSfTileRowsC == 0, "TileN (", options.mTileN, ") must be a multiple of ", - numSfTileRowsC, " for C SF layout ", tg::sfLayoutToString(options.mSfLayoutC)); + int const tileTokenDim = options.mTransposeMmaOutput ? options.mTileN : options.mTileM; + TLLM_CHECK_ERROR_FMT(tileTokenDim % numSfTileRowsC == 0, + "Tile%s (%d) must be a multiple of %d for C SF layout %s", options.mTransposeMmaOutput ? "N" : "M", + tileTokenDim, numSfTileRowsC, tg::sfLayoutToString(options.mSfLayoutC).c_str()); int const hiddenDim = options.mTransposeMmaOutput ? options.mM : options.mN; int const hiddenGranularity = 4 * tg::dtypeNumEltsPerSf(options.mDtypeC); @@ -753,7 +879,6 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mM > 0 && options.mN > 0 && options.mK > 0, "M, N and K must be larger than 0"); TLLM_CHECK_ERROR(options.mNumSlicesForSplitK > 0, "Split K must be larger than 0."); - TLLM_CHECK_ERROR(options.mK % options.mTileK == 0, "K must be a multiple of TileK"); if (options.mUseShuffledMatrixA) { @@ -911,6 +1036,11 @@ inline bool checkAndUpdateGemmOptions( { TLLM_CHECK_ERROR( options.mNumStagesMmaWithinWorkTile == 1, "Non-DeepSeekFp8 requires numStagesMmaWithinWorkTile == 1"); + if (options.mNumStagesMma > 1) + { + TLLM_CHECK_ERROR(options.mTileScheduler == TileScheduler::Persistent, + "Non-DeepSeekFp8 requires persistent scheduler when using numStagesMma >1"); + } } if (options.mUseDeepSeekFp8) { @@ -923,6 +1053,7 @@ inline bool checkAndUpdateGemmOptions( // Check that TileK = 128 for correct scaling of every 128 channels. TLLM_CHECK_ERROR(options.mTileK == 128, "Tile-K must be equal to 128 for DeepSeek Fp8"); + TLLM_CHECK_ERROR(options.mK % options.mTileK == 0, "K must be a multiple of TileK"); // Tile sizes of the output hidden dimension. auto hiddenDimPerOutputTile = options.mTransposeMmaOutput ? options.mTileM : options.mTileN; auto hiddenDimPerEpilogueTile = options.mTransposeMmaOutput ? 
options.mEpilogueTileM : options.mEpilogueTileN; @@ -997,14 +1128,22 @@ inline bool checkAndUpdateGemmOptions( if (options.mUseUnrollLoop2xForMma) { - bool notSupported = (options.mK / options.mNumSlicesForSplitK) % (options.mTileK * 2) != 0; - // Check that the 2*TileK is a multiple of MmaK when UnrollLoop2x is enabled. - // This is to avoid deadlock when mma runs even-numbered loop while the other warps run - // odd-numbered loop. + // Number of iterations in K dimension after padding. + // Note the perCtaK in each CTA in the splitK group are padded to the same number of iterations. + // E.g., K = 512, TileK = 128, numSlicesForSplitK = 3. Then the padded K is + // + // ceil(512 / (128*3)) * (128*3) = 768 + // + int paddedK = divUpMul(options.mK, options.mTileK * options.mNumSlicesForSplitK); + // Check that the padded K (K rounded to next multiple of tileK) is a multiple of 2*TileK when + // UnrollLoop2x is enabled. This is to avoid deadlock when mma runs even-numbered loop while the + // other warps run odd-numbered loop. + // + bool notSupported = (paddedK / options.mNumSlicesForSplitK) % (options.mTileK * 2) != 0; if (notSupported) { TLLM_LOG_WARNING("Size K / splitK must be a multiple of TileK * 2. Found TileK=", options.mTileK, - " and K=", options.mK, " and numSlicesForSplitK=", options.mNumSlicesForSplitK, + " and K=", options.mK, " (paddedK=", paddedK, ") and numSlicesForSplitK=", options.mNumSlicesForSplitK, ". Disabling unrollLoop2xForMma."); if (updateOptions) { @@ -1059,43 +1198,108 @@ inline bool checkAndUpdateGemmOptions( // // Kernel 1: ----PREEXIT-----------FLUSH // Kernel 2: -------PREEXIT----ACQBULK---FLUSH - // Kernel 3: Warp 0: ---- (!) Output of 1,2 is not yet visible ----------------------- - // Warp 1: ---- (!) We normally assume that 1 is visible is not yet visible- - // Warp 2: -------------------ACQBULK-- Kernel 1,2 output visible ---------- + // Kernel 3: Warp 0: ---- (!) 
Output of 1,2 is not yet visible + // ----------------------- + // Warp 1: ---- (!) We normally assume that 1 is visible is not yet + // visible- Warp 2: -------------------ACQBULK-- Kernel 1,2 output visible + // ---------- TLLM_CHECK_ERROR((options.mGridWaitForPrimaryA || !options.mGridTriggerSecondaryA), "A: If a task triggers a secondary kernel, it must also wait for primary kernel."); TLLM_CHECK_ERROR((options.mGridWaitForPrimaryB || !options.mGridTriggerSecondaryB), "B: If a task triggers a secondary kernel, it must also wait for primary kernel."); + if (options.mUsePerTokenSfA || options.mUsePerTokenSfB) + { + // Checks applicable to both MetaFP8 and RoutingScalesOnInput + TLLM_CHECK_ERROR(!options.mUseDeepSeekFp8, "DeepSeek FP8 and per-token scaling are not compatible"); + TLLM_CHECK_ERROR(isBlackwell, "Per-token scaling is not supported for Hopper"); + if (options.mUsePerTokenSfA && options.mUsePerTokenSfB) + { + // MetaFP8 case + TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::E4m3 && options.mDtypeB == tg::Dtype::E4m3, + "A and B dtype must be E4m3 for Meta Fp8. 
Found dtypeA=", tg::dtypeToString(options.mDtypeA), + " dtypeB=", tg::dtypeToString(options.mDtypeB)); + } + else + { + // RoutingScalesOnInput case + TLLM_CHECK_ERROR((options.mUsePerTokenSfA && !options.mTransposeMmaOutput) + || (options.mUsePerTokenSfB && options.mTransposeMmaOutput), + "In RoutingScalesOnInput mode, perToken scales must be used on activations"); + } + } + // The generation should support non K-major layouts for both A and B; however, it is unclear if // there is a use-case - TLLM_CHECK_ERROR(!options.mTransposeMatrixA || options.mTransposeMatrixB, - "TransposeA true and TransposeB false is not supported"); + TLLM_CHECK_ERROR((options.mLayoutA == MatrixLayout::MajorK) || (options.mLayoutB == MatrixLayout::MajorK), + "At least one matrix must be in k-major layout"); // Some features are currently only support when both matrices are in K-major format - if (options.mTransposeMatrixA || !options.mTransposeMatrixB) + if (options.mLayoutB != MatrixLayout::MajorK || options.mLayoutB != MatrixLayout::MajorK) { TLLM_CHECK_ERROR(isBlackwell, "Non K-major layouts are only supported on Blackwell"); TLLM_CHECK_ERROR(options.mSplitK == SplitK::None, "Non K-major layouts do not support split K"); } - if (options.mTransposeMatrixA) + if (options.mLayoutA == MatrixLayout::MajorMn) { TLLM_CHECK_ERROR(tg::dtypeGetNumBits(options.mDtypeA) >= 8, "Subbyte types only support K major layout"); } - if (!options.mTransposeMatrixB) + if (options.mLayoutB == MatrixLayout::MajorMn) { TLLM_CHECK_ERROR(tg::dtypeGetNumBits(options.mDtypeB) >= 8, "Subbyte types only support K major layout"); } + if ((options.mLayoutA == MatrixLayout::BlockMajorK) || (options.mLayoutB == MatrixLayout::BlockMajorK)) + { + bool const isBlockA = options.mLayoutA == MatrixLayout::BlockMajorK; + + // Block K size must be 128B. 
+ // TODO Leaving this as an option for now in case we want to expertiment with other block sizes + // As the user is not expected to set this, do not fail if updateOptions is false + int32_t const elemSizeInBits + = (isBlockA) ? tg::dtypeGetNumBits(options.mDtypeA) : tg::dtypeGetNumBits(options.mDtypeB); + int32_t const elemsIn128B = 128 * 8 /* Bits in byte */ / elemSizeInBits; + + if (options.mBlockK != elemsIn128B) + { + if (updateOptions) + { + options.mBlockK = elemsIn128B; + } + else + { + return false; + } + } + + if (options.mBlockK > options.mTileK) + { + TLLM_CHECK_ERROR(options.mBlockK % options.mTileK == 0, + "If block size is greater than tile size, block size must be a multiple of tile size"); + } + else if (options.mBlockK < options.mTileK) + { + TLLM_CHECK_ERROR(options.mTileK % options.mBlockK == 0, + "If tile size is greater than block size, tile size must be a multiple of block size"); + } + } + + if (!isBiasTypeNone(options.mBiasType)) + { + TLLM_CHECK_ERROR(!isBiasTypeMn(options.mBiasType), "BiasType::Mn is not supported"); + TLLM_CHECK_ERROR(!options.mUseDeepSeekFp8, "Bias is not supported for DeepSeek Fp8"); + TLLM_CHECK_ERROR(!(options.mUsePerTokenSfA && options.mUsePerTokenSfB), "Bias is not supported for Meta Fp8"); + } + if (updateOptions) { // Init kernel traits. 
options.mKernelTraits = KernelTraits(options.mDtypeA, options.mDtypeB, options.mDtypeC, options.mDtypeAcc, - options.mMmaKind, options.mTileM, options.mTileN, options.mTileK, options.mEpilogueTileM, - options.mEpilogueTileN, options.mNumStages, options.mNumStagesMma, options.mNumSlicesForSplitK, - options.mNumSlicesForSliceK, options.mSplitK, options.mUseTmaStore, options.mTransposeMmaOutput, - options.mAllReduceAlgo, options.mTileScheduler == TileScheduler::Persistent, options.mUseDeepSeekFp8, - options.mUsePerTokenSfA, options.mUsePerTokenSfB); + options.mDtypeMmaA, options.mDtypeMmaB, options.mMmaKind, options.mTileM, options.mTileN, options.mTileK, + options.mEpilogueTileM, options.mEpilogueTileN, options.mNumStages, options.mNumStagesMma, + options.mNumSlicesForSplitK, options.mNumSlicesForSliceK, options.mSplitK, options.mUseTmaStore, + options.mTransposeMmaOutput, options.mAllReduceAlgo, options.mTileScheduler == TileScheduler::Persistent, + options.mUseDeepSeekFp8, options.mUsePerTokenSfA, options.mUsePerTokenSfB, options.mBiasType); } return true; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h index 5d55ff418b..7a748fefae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelMetaInfo.h @@ -28,113 +28,115 @@ namespace kernels { // clang-format off -#define TLLM_GEN_COMMIT "744dc79e" -#define TLLM_GEN_EXPORT_VERSION "6.0" +#define TLLM_GEN_COMMIT "32110ebf-dirty" +#define TLLM_GEN_EXPORT_VERSION "7.0" static constexpr size_t tllmGenGemmListLen = 46; #ifndef EXCLUDE_SM_100 -extern unsigned char GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; -extern unsigned char 
GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; 
-extern unsigned char GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned 
char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char 
GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin[]; -extern unsigned char GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char 
Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char 
Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned 
char Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin[]; +extern unsigned char 
Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; +extern unsigned char Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin[]; #endif // EXCLUDE_SM_100 #ifndef EXCLUDE_SM_100 -extern unsigned int GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned 
int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned 
int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int 
GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len; -extern unsigned int GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int 
Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int 
Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int 
Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len; 
+extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len; +extern unsigned int Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; +extern unsigned int Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len; #endif // EXCLUDE_SM_100 static const gemm::GemmConfig tllmGenGemmList[] = { #ifndef EXCLUDE_SM_100 -{GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 150528, "gemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a", 320, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 150528, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a", 320, "67bad780d8f03b24804e34cc5317720c13949c72009f311e9d17a1cd6b10819a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -142,6 +144,8 @@ static const gemm::GemmConfig 
tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -158,6 +162,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -173,11 +179,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 0 , /* mTileM */ 128 , /* mTileN */ 128 @@ -194,9 +199,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 175104, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, 
+{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 175104, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "f71c31c377d83567ce6db02f270354c85d9cab4e8543726c4f0322121eea617c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -204,6 +212,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -220,6 +230,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -235,11 +247,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -256,9 +267,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ 
gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 175104, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 175104, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "257908af538388410f3ec0d3e5108e288a5c77d2f9b02383618d5fb08002ee51", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -266,6 +280,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -282,6 +298,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -297,11 +315,10 @@ static const gemm::GemmConfig 
tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -318,9 +335,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 168960, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 168960, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a", 224, "550d553ade1407a25f2520568149e07b978f150afef43118397e6aa90111bd9b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -328,6 +348,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ 
trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -344,6 +366,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -359,11 +383,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 0 , /* mTileM */ 128 , /* mTileN */ 128 @@ -380,9 +403,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 84992, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 84992, 
"gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "de2f28d823b6eb726debee79d8733d8656821252accd929d3158cace2b6a845a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -390,6 +416,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -406,6 +434,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -421,11 +451,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -442,9 +471,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, 
GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 84992, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 84992, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "e103cc3353cc9f652e99cbf588ba053ba3b914596dd93c87b25f6e7f1225b5cc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -452,6 +484,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -468,6 +502,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -483,11 +519,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* 
mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -504,9 +539,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 97280, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 97280, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "a75f88e0c9791b7f53e6c6e764784015b47b2f0a97b22896823a15cc64f69e5a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -514,6 +552,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -530,6 
+570,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -545,11 +587,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -566,9 +607,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 97280, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 97280, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "dca5f0c1dcb998e0baa7db6c4a023d5a06e19de4dbb9ce9edb6dd9edc89bb431", { /* 
mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -576,6 +620,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -592,6 +638,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -607,11 +655,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -628,9 +675,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 123904, 
"gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 123904, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "ba2d2e2e05b540cf1f56d35c87287a255f1d2043e8970acaf57c928bb5ece183", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -638,6 +688,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -654,6 +706,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -669,11 +723,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -690,9 +743,12 @@ static const gemm::GemmConfig 
tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 123904, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 123904, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "3128c19b37c22de699fc92e22cbee5d4b05c28dfb85418044ce1eee6f3e9744f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -700,6 +756,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -716,6 +774,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ 
gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -731,11 +791,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -752,9 +811,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "34d0f58dd43e9428e983e7b8bc8c1d703ee9a0cc46f73cb0bf99c40f7de542df", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -762,6 +824,8 
@@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -778,6 +842,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -793,11 +859,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -814,9 +879,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, 
+{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "007f0ffe31b6104e71385d9f7f378e7974fd942fb6f528eb3ac28b387eed6338", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -824,6 +892,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -840,6 +910,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -855,11 +927,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -876,9 +947,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - 
}, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 217088, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 217088, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 224, "cd112e5364c7b204daf52b09f6fc23d37f7292fa14f970b050905c53cd71e487", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -886,6 +960,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -902,6 +978,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -917,11 +995,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 
+, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -938,9 +1015,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len, 215040, "gemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len, 215040, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a", 224, "cb469548e2f1507579cf58c9cd864472a38203939a81cecd43376c8086839601", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 2 @@ -948,6 +1028,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* 
mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -964,6 +1046,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -979,11 +1063,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -1000,9 +1083,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 225280, "gemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 225280, "gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 448, 
"6aad723e3a1f1267f892edbc89a7b95e7058daaae6e65c7a1d8a81968b42df58", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1010,6 +1096,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826828) , /* mDtypeB */ trtllm::gen::Dtype(17827853) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1026,6 +1114,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(5) @@ -1041,11 +1131,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -1062,9 +1151,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 158720, 
"gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 158720, "gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "fd16bb4cb68c22a7d6135c131cf9458e054960de0adad06a596aca06a0d4f723", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1072,6 +1164,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1088,6 +1182,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1103,11 +1199,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -1124,9 +1219,12 @@ static const 
gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 158720, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 158720, "gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "92f81034b485c2b6724cb83002fdcddb1d102a21b69e265c851263198b16d15b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1134,6 +1232,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1150,6 +1250,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* 
mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1165,11 +1267,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -1186,9 +1287,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 218112, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 218112, "gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a", 224, "0b5cf51c225dd33ce5348a23c87bce062df590938af2547a8408e6bddf563bb5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1196,6 +1300,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ 
trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1212,6 +1318,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1227,11 +1335,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 0 , /* mTileM */ 128 , /* mTileN */ 128 @@ -1248,9 +1355,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 82944, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, 
Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 82944, "gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "35b5bd8342a4ae619b7a441f48e5c96afe7047c4c05e8e8a2fed2023699008bd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1258,6 +1368,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1274,6 +1386,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1289,11 +1403,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -1310,9 +1423,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, 
-{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 82944, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 82944, "gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "995b3f8d3be717372fceccd1bcbf9d811a191685ae50208f29f9779b0f1c20e1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1320,6 +1436,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1336,6 +1454,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1351,11 +1471,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* 
mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -1372,9 +1491,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 93184, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 93184, "gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "4ae45f02a96abc1fee68d5d482590733206bb21974d7540b244c72599d42b029", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1382,6 +1504,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ 
trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1398,6 +1522,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1413,11 +1539,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -1434,9 +1559,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 93184, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 93184, 
"gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "eb325f16f32466cef919bc05f3a08c49ca6ca7d4fb02cb87086a33b3c2893ae6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1444,6 +1572,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1460,6 +1590,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1475,11 +1607,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -1496,9 +1627,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, 
GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 115712, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 115712, "gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "e0cde71b5344cd24a607f59f690775b198c2dbe8caf1194b82ebfcfe0e7d22a1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1506,6 +1640,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1522,6 +1658,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1537,11 +1675,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* 
mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -1558,9 +1695,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 115712, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 115712, "gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "f9407c6f8fba4f59f4f8568987b1d29a542fc3076e8cdff5cb80e74d9f4ddcf0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1568,6 +1708,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1584,6 +1726,8 @@ static const 
gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1599,11 +1743,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -1620,9 +1763,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "2cfc6cace0893b2a2866d06b24146d77b9c2568d371b9e41337f99f020ddf6e9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ 
gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1630,6 +1776,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1646,6 +1794,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1661,11 +1811,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -1682,9 +1831,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, 
gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "67414166d1a8a94e40a6f6a2f7d24e99f7634eeaacd015098f212368bd3bc5bc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1692,6 +1844,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1708,6 +1862,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1723,11 +1879,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -1744,9 +1899,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ 
gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 216064, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 216064, "gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 224, "99dbd99a0e95a841e9416a1099a40fd4e2b42f1a44fd0352ac331b307cee14f4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1754,6 +1912,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1770,6 +1930,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1785,11 +1947,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* 
mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -1806,9 +1967,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len, 215040, "gemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin, Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len, 215040, "gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a", 224, "5b1f3d6e3705a32cc257f0566b57dba0ae89c90aae749090ea864e3ac1e152d9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 2 @@ -1816,6 +1980,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , 
/* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1832,6 +1998,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1847,11 +2015,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -1868,9 +2035,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 150528, "gemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a", 320, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 150528, "gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a", 320, "294b977b50865e7fbe41ef9d006e1912856c96b4e15a7588e24d783e044d0929", { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1878,6 +2048,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1894,6 +2066,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -1909,11 +2083,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 0 , /* mTileM */ 128 , /* mTileN */ 128 @@ -1930,9 +2103,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 175104, 
"gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 175104, "gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "d05d164e79e213a9b04ce518ced20ec9faad7967ad951cff14a9e60ef47a7047", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -1940,6 +2116,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1956,6 +2134,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -1971,11 +2151,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -1992,9 +2171,12 @@ static const 
gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 175104, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 175104, "gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "d2975334a8914f64992a06417b40d42b1c25e8a57dff639bdb8b0768faea4037", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2002,6 +2184,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2018,6 +2202,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* 
mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2033,11 +2219,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 @@ -2054,9 +2239,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 168960, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 168960, "gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a", 224, "277bd69ed7d198f9fc8ab87c7d9df3f74762dcc48a845711fa881ef1a345c03d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2064,6 +2252,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ 
trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2080,6 +2270,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2095,11 +2287,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 0 , /* mTileM */ 128 , /* mTileN */ 128 @@ -2116,9 +2307,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 84992, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, 
Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 84992, "gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "749b06758d88b8c6c9233bef06af59f5c717e767d0ab7636726a6f5808aebec9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2126,6 +2320,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2142,6 +2338,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2157,11 +2355,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -2178,9 +2375,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, 
-{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 84992, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 84992, "gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "4f107da93c4869724202d13aa7d6f69a618ec18367ed1f434efb804e9e2950cb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2188,6 +2388,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2204,6 +2406,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2219,11 +2423,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* 
mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 16 @@ -2240,9 +2443,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 97280, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 97280, "gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "05b668ebaed4847a3ed92ce43d78442fd006621261f095ddb36ba150ed2d4ad9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2250,6 +2456,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ 
trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2266,6 +2474,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2281,11 +2491,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -2302,9 +2511,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 97280, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 97280, 
"gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "c2924dd87f53e3decedf433f205b742ae84cef4a77d7862441591ed0203b91d4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2312,6 +2524,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2328,6 +2542,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2343,11 +2559,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 32 @@ -2364,9 +2579,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, 
GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 123904, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 123904, "gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "3b6e8407873ded09c5e433d5aa3e4ec0323ba895d13aea285e3a92ce1836046f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2374,6 +2592,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2390,6 +2610,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2405,11 +2627,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* 
mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -2426,9 +2647,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 123904, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 123904, "gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "6a647bb4f09f60d57a88d8b744bf772a261b3ccc3a5706b4ef0396438ef19cba", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2436,6 +2660,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2452,6 +2678,8 @@ static const 
gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2467,11 +2695,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 64 @@ -2488,9 +2715,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "98c0347b249a8f8c1e3a891f1532d0f3851ce781028f9d92c0dd18bf9705fd81", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ 
gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2498,6 +2728,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2514,6 +2746,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2529,11 +2763,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2550,9 +2783,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin_len, 78848, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a", 416, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, 
gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin_len, 78848, "gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a", 416, "103144508c3b2e8d01fcb40ec5980f67b2bdd5da4687635039f007e5ac798546", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2560,6 +2796,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2576,6 +2814,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 1 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2591,11 +2831,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2612,9 +2851,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ 
gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 217088, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 217088, "gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 224, "47a6adb00497e6864f5f3dd1eb9326de21daddb02576d665951f268568598a9d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2622,6 +2864,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2638,6 +2882,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2653,11 +2899,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* 
mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2674,9 +2919,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin_len, 215040, "gemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a", 224, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin, Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin_len, 215040, "gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a", 224, "0e025e6540a9eaf9e9d84635fcfcc4d63d563288d1210667a446c150eaf44620", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 2 @@ -2684,6 +2932,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(1050629) , /* mDtypeB */ trtllm::gen::Dtype(1050629) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) , 
/* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2700,6 +2950,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(2) @@ -2715,11 +2967,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2736,9 +2987,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 225280, "gemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 225280, "gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 448, 
"acd6bcbe32966a092b0241457c15142a729a33e40ab4a3d5f9e5ada9d0ca80b1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2746,6 +3000,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826828) , /* mDtypeB */ trtllm::gen::Dtype(17827853) , /* mDtypeC */ trtllm::gen::Dtype(1052679) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2762,6 +3018,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(5) @@ -2777,11 +3035,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2798,9 +3055,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin, GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin_len, 183296, 
"gemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a", 320, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin, Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin_len, 183296, "gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a", 320, "b9afcb7beb9cbdf56629ab3e7396c803e13f1a1410e569b60332a123a2aeea2a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2808,6 +3068,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826818) , /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2824,6 +3086,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 , /* mMmaKind */ trtllm::gen::MmaKind(4) @@ -2839,11 +3103,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 0 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 0 , /* mTileM */ 128 , /* mTileN */ 128 @@ -2860,9 +3123,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* 
mSfLayoutB */ trtllm::gen::SfLayout(3) , /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 227328, "gemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 227328, "gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 448, "0b7d9e47ffaf50c12ff6888aba42111a2d731fb6aada6a63f33ea578300a2add", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2870,6 +3136,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826828) , /* mDtypeB */ trtllm::gen::Dtype(17827853) , /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2886,6 +3154,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(5) @@ 
-2901,11 +3171,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2922,9 +3191,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, -{GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin, GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin_len, 224256, "gemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a", 448, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) + }, gemm::SmVersion::Sm100a}, +{Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin, Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin_len, 224256, "gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a", 448, "84964cb97f1ba9d334e25c76018e5ab73cc2f1fcbee4391d7e76f61c52a64b9c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 @@ -2932,6 +3204,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mDtypeA */ trtllm::gen::Dtype(17826828) , /* mDtypeB */ trtllm::gen::Dtype(17827853) , /* 
mDtypeC */ trtllm::gen::Dtype(17827853) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2948,6 +3222,8 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 , /* mMmaKind */ trtllm::gen::MmaKind(5) @@ -2963,11 +3239,10 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMatrixA */ 0 -, /* mTransposeMatrixB */ 1 , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 8 @@ -2984,12 +3259,12 @@ static const gemm::GemmConfig tllmGenGemmList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) - }, gemm::SmVersion::Sm100a }, + }, gemm::SmVersion::Sm100a}, #endif // EXCLUDE_SM_100 }; // clang-format on - } // namespace kernels } // namespace tensorrt_llm } // namespace gemm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h index 142e9728dc..17199d0f17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParams.h @@ -22,6 +22,10 @@ #include "Enums.h" #include "TmaDescriptor.h" +// NOTE: keep this code dependency free. It has to be included by the device code and has to be +// compilable with NVRTC. 
+#include "KernelParamsDecl.h" + namespace gemm { @@ -29,535 +33,305 @@ namespace gemm { //////////////////////////////////////////////////////////////////////////////////////////////////// - namespace tg = trtllm::gen; -//////////////////////////////////////////////////////////////////////////////////////////////////// - -struct KernelParams +namespace KernelParamsSetup { #ifdef TLLM_ENABLE_CUDA - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // Gemm parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - // TMA descriptor for A. - // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideAb. - // - // If transposeMatrixA is false - // Logical shape is [M, K]. - // Logical strides are [K, 1]. - // Tile box shape is [tileM, tileK]. - // Tile box strides are [tileK, 1]. - // Dtype is set from options.mDtypeA. - // - // If transposeMatrixA is true - // Logical shape is [K, M]. - // Logical strides are [M, 1]. - // Tile box shape is [tileK, tileM]. - // Tile box strides are [tileM, 1]. - // Dtype is set from options.mDtypeA. - CUtensorMap tmaA; +using MatrixType = KernelParams::MatrixType; - // TMA descriptor for B. - // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideAb. - // - // If transposeMatrixB is true - // Logical shape is [N, K]. - // Logical strides are [K, 1]. - // Tile box shape is [tileN, tileK]. - // Tile box strides are [tileK, 1]. - // Dtype is set from options.mDtypeB. - // - // If transposeMatrixB is false - // Logical shape is [K, N]. - // Logical strides are [N, 1]. - // Tile box shape is [tileK, tileN]. - // Tile box strides are [tileN, 1]. - // Dtype is set from options.mDtypeB. - CUtensorMap tmaB; +// Create the TMA shape/stride for A/B. 
+template +static auto makeTmaShapeStrideAb(GemmOptions const& options, MatrixType matrixType) +{ + // The outer dimension. + auto numTokens = (matrixType == MatrixType::MatrixA) ? options.mM : options.mN; + // The outer dimension tile size. + auto tileMn = (matrixType == MatrixType::MatrixA) ? options.mTileM : options.mTileN; + // The inner dimension. + auto hiddenSize = options.mK; + // The cute tensor shape for A/B: (numTokens, hiddenSize). + // Note that TMA descriptor expects the first dimension's stride to be + // 1, so swap the first two dimension so that the hiddenSize dimension comes first. + auto shape = std::vector{static_cast(hiddenSize), static_cast(numTokens)}; - // TMA descriptor for C, (when useTmaStore is true) - // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideC. - // - // If transposeMmaOutput is false, - // Logical shape is [M, N]. - // Logical strides are [N, 1]. - // Tile box shape is [epilogueTileM, epilogueTileN]. - // Tile box strides are [epilogueTileN, 1]. - // Dtype is set from options.mDtypeC. - // - // If transposeMmaOutput is true, - // Logical shape is [N, M]. - // Logical strides are [M, 1]. - // Tile box shape is [epilogueTileN, epilogueTileM]. - // Tile box strides are [epilogueTileM, 1]. - // Dtype is set from options.mDtypeC. - CUtensorMap tmaC; + // Assemble the stride (strideTokens, 1). + // Swap the first two dimension as mentioned before. + auto stride = std::vector{1, static_cast(hiddenSize)}; - // TMA descriptor for the block scaling factors for A, for MxFp{4,8} and NvFp4 formats. - // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from - // makeTmaShapeStrideSfAb. - // The layout of scaling factors for A is always R128c4 - // - // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. - // K must be a multiple of 4P. - // The "logical" shape is: [M, K / P]. - // The R128c4 layout is: [⌈M / 128⌉, K / P / 4, 512]. 
- // The shape we use for TMA is: [⌈M / 128⌉, K / P / 4, 2, 256]. - // - // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. - CUtensorMap tmaSfA; + // Assemble the box shape + std::vector tileShape = {options.mTileK, tileMn}; - // TMA descriptor for the block scaling factors for B, for MxFp{4,8} and NvFp4 formats. - // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from - // makeTmaShapeStrideSfAb. - // The layout of scaling factors for B is controlled by options.mSfLayoutB. - // - // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. - // The "logical" shape is: [N, K / P] - // - // If the layout is R128c4, - // K must be a multiple of 4P. - // The R128c4 layout is: [⌈N / 128⌉, K / P / 4, 512] - // The shape we use for TMA is: [⌈N / 128⌉, K / P / 4, 2, 256] - // - // If the layout is R8c4, - // K must be a multiple of 4P. - // The R8c4 layout is: [⌈N / 8⌉, K / P / 4, 32] - // The shape we use for TMA is: [⌈N / 8⌉, K / P / 4 / r, r * 32] - // where r = min(tileK / P / 4, 8) - // - // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. - CUtensorMap tmaSfB; - - // The output matrix C. The data type is controlled by options.mDtypeC. - // - // When transposeMmaOutput is true, the shape is [N, M]. - // Otherwise, the shape is [M, N]. - // Elements in a given row are stored contiguously in memory (row-major). - void* ptrC; - - // The block scaling factors to dequantize A. - // - // If DeepSeek FP8 recipe is used: - // If transposeMmaOutput is false, shape is [K / 128, M]. - // Otherwise, shape is [M / 128, K / 128]. - // The rightmost dimension is contiguous in memory. - // - // If DeepSeek FP8 recipe is not used, but for MxFp{4,8} and NvFp4 formats: - // The layout and data type is the same as explained in tmaSfA. - // - // Otherwise should be set to nullptr. - void const* ptrSfA; - - // The scaling factors to dequantize B. 
- // - // If DeepSeek FP8 recipe is used: - // If transposeMmaOutput is false, shape is [N / 128, K / 128]. - // Otherwise, shape is [K / 128, N]. - // The rightmost dimension is contiguous in memory. - // - // If DeepSeek FP8 recipe is not used, but for MxFp{4,8} and NvFp4 formats: - // The layout and data type is the same as explained in tmaSfB. - // - // Otherwise should be set to nullptr. - void const* ptrSfB; - - // The per-token scaling factors from scale A. - // - // This is used for either: - // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 - // * When the routing scales are applied to the input activations (only when output is not - // transposed). The dtype is Dtype::Bfloat16 - // - // The shape is [M] - void const* ptrPerTokenSfA; - - // The per-token scaling factors from scale B. - // - // This is used for either: - // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 - // * When the routing scales are applied to the input activations (only when output is - // transposed). The dtype is Dtype::Bfloat16 - // - // The shape is [N] - void const* ptrPerTokenSfB; - - // The scaling factors calculated when quantizing C, for MxFp{4,8} and NvFp4 formats, also - // used for the DeepSeek FP8 recipe. - // - // For DeepSeek FP8 recipe: - // If transposeMmaOutput is false, shape is [N / 128, M]. - // Otherwise, shape is [M / 128, N]. - // The rightmost dimension is contiguous in memory. - // - // For MxFp{4,8} and NvFp4 formats: - // If transposeMmaOutput is false, shape is [M, N / 16]. - // Otherwise, shape is [N, M / 16]. - // The layout is controlled by options.mSfLayoutC (either R128c4 or R8c4). - void* ptrSfC; - - // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. - // TensorRT-LLM API requires a scaling factor on the device. - // Shape is [1]. - float const* ptrScaleC; - - // The M dimension. 
- // It is the total number of tokens if A is the activation matrix. - // It is the total number of output channels if A is the weight matrix. - int32_t m; - // The N dimension. - // It is the total number of tokens if B is the activation matrix. - // It is the total number of output channels if B is the weight matrix. - int32_t n; - // The K dimension. It is the hidden dimension of the input matrices. - int32_t k; - - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // All-reduce parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - - // The rank id of the current device in the multi-gpu space. - int rank; - // The number of peer devices in tensor-parallel group. - int tpGrpSize; - // Pointer for output with multicast mapping. It is used by the "reduce" op (LDGMC.ADD) of the - // two-shot reduce-scatter phase. - // The shape is [M, N] and the dtype is float. - void* multimemC; - - // The barriers in global memory. - // - // The kernel arrives at (with release ordering) the multicast mapping of the barrier to broadcast - // amongst peer devices. It then waits (with acquire ordering) for the unicast mapping of the - // barrier. - // - // Flags in global memory that sync on "entrance" of reduce-scatter phase in two-shot all-reduce. - // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. - // The pointer to the unicast memory created with IpcNvlsHandle. - // Must be set to 0 before the kernel launch. - void* ptrTileBars; - // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. - // The pointer to the multicast memory created with IpcNvlsHandle. - void* multimemTileBars; - - // Flags in global memory that sync on "exit" after the all-reduce finishes. - // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. - // The pointer to the unicast memory created with IpcNvlsHandle. 
- // Must be set to 0 before the kernel launch. - void* ptrCompletionBars; - // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. - // The pointer to the multicast memory created with IpcNvlsHandle - void* multimemCompletionBars; - - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // Miscellaneous parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - - // The barriers in global memory for Split-k reduction with exchange in GMEM. - // Each CTAs arrives at the barrier and blockIdx.z == gridDim.Z - 1 waits for the barrier to flip - // to perform a reduction. - // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. - // For DeepSeek FP8 recipe, the shape is [numTilesM * numTilesN * 2]. - // The memory must be set to 0 before the kernel launch. - void* ptrSplitKCompletionBars; - - // Pointer to the memory holding the partial sums for split-K in GMEM. - // The shape is [numSlicesForSplitK, numSlicesForSliceK, numTilesM * tileM, numTilesN * tileN]. - // The dtype is dtypeAcc, i.e. float. - void* ptrPartialSumsForSplitK; - - // In some cases, some CTAs need to exit early. E.g. when the grid is statically set, but the - // actual workload is decided at runtime. This device pointer maps to the number of non exiting - // CTAs in the X dim of the grid when transposeMmaOutput is false. And the Y dim, otherwise. - // The pointer points to a scalar and the dtype is int32_t. The pointed value must be >= 0. - int32_t* ptrNumNonExitingCtas; - - ////////////////////////////////////////////////////////////////////////////////////////////////// - // - // Miscellaneous parameters. - // - ////////////////////////////////////////////////////////////////////////////////////////////////// - - enum class MatrixType + MatrixLayout layout = (matrixType == MatrixType::MatrixA) ? 
options.mLayoutA : options.mLayoutB; + if (layout == MatrixLayout::MajorMn) { - MatrixA = 0, - MatrixB - }; - - // Create the TMA shape/stride for A/B. - template - static auto makeTmaShapeStrideAb(GemmOptions const& options, MatrixType matrixType) - { - // The outer dimension. - auto numTokens = (matrixType == MatrixType::MatrixA) ? options.mM : options.mN; - // The inner dimension. - auto hiddenSize = options.mK; - // The cute tensor shape for A/B: (numTokens, hiddenSize). - // Note that TMA descriptor expects the first dimension's stride to be - // 1, so swap the first two dimension so that the hiddenSize dimension comes first. - auto shape = std::vector{static_cast(hiddenSize), static_cast(numTokens)}; - - // Assemble the stride (strideTokens, 1). - // Swap the first two dimension as mentioned before. - auto stride = std::vector{1, static_cast(hiddenSize)}; - // Apply transpose if necessary - if ((matrixType == MatrixType::MatrixA && options.mTransposeMatrixA) - || (matrixType == MatrixType::MatrixB && !options.mTransposeMatrixB)) - { - std::swap(shape[0], shape[1]); - stride[1] = numTokens; - } - - return std::make_tuple(shape, stride); + std::swap(shape[0], shape[1]); + stride[1] = numTokens; + std::swap(tileShape[0], tileShape[1]); } - - // Create the TMA shape/stride for C. - template - static auto makeTmaShapeStrideC(GemmOptions const& options) + else if (layout == MatrixLayout::BlockMajorK) { - // The number of tokens. - auto numTokens = options.mTransposeMmaOutput ? options.mN : options.mM; - // The hidden dimension. - auto hiddenSize = options.mTransposeMmaOutput ? options.mM : options.mN; - // Note that TMA descriptor expects the first dimension's stride to be - // 1, so swap the first two dimension so that the hiddenSize dimension comes first. 
- auto shape = std::vector{static_cast(hiddenSize), static_cast(numTokens)}; + // Set shapes based on blocking layout + shape = {static_cast(options.mBlockK), static_cast(numTokens), + static_cast(options.mK / options.mBlockK)}; + stride = {1, static_cast(options.mBlockK), static_cast(numTokens * options.mBlockK)}; - // Assemble the stride (strideTokens, 1). - // Swap the first two dimension as mentioned before. - auto stride = std::vector{1, static_cast(hiddenSize)}; - - return std::make_tuple(shape, stride); + // If blockK > tileK, then the inner most box size will be based on the tile + int32_t const tileBlockK = std::min(options.mBlockK, options.mTileK); + tileShape = {tileBlockK, tileMn, options.mTileK / tileBlockK}; } - // Create the TMA shape/stride for A/B block scaling factors. - template - static auto makeTmaShapeStrideSfAb(GemmOptions const& options, MatrixType matrixType, tg::SfLayout layout) + return std::make_tuple(shape, stride, tileShape); +} + +// Create the TMA shape/stride for C. +template +static auto makeTmaShapeStrideC(GemmOptions const& options) +{ + // The number of tokens. + auto numTokens = options.mTransposeMmaOutput ? options.mN : options.mM; + // The hidden dimension. + auto hiddenSize = options.mTransposeMmaOutput ? options.mM : options.mN; + // Note that TMA descriptor expects the first dimension's stride to be + // 1, so swap the first two dimension so that the hiddenSize dimension comes first. + auto shape = std::vector{static_cast(hiddenSize), static_cast(numTokens)}; + + // Assemble the stride (strideTokens, 1). + // Swap the first two dimension as mentioned before. + auto stride = std::vector{1, static_cast(hiddenSize)}; + + return std::make_tuple(shape, stride); +} + +// Create the TMA shape/stride for A/B block scaling factors. +template +static auto makeTmaShapeStrideSfAb(GemmOptions const& options, MatrixType matrixType, tg::SfLayout layout) +{ + // The outer dimension. + auto numTokens = matrixType == MatrixType::MatrixA ? 
options.mM : options.mN; + // The inner dimension. + auto hiddenSize = options.mK; + // The outer tile dimension. + auto numTokensPerTile = matrixType == MatrixType::MatrixA ? options.mTileM : options.mTileN; + // The inner tile dimension. + auto hiddenSizePerTile = options.mTileK; + // The dtype of the matrix. + tg::Dtype matrixDtype = matrixType == MatrixType::MatrixA ? options.mDtypeA : options.mDtypeB; + // Number of elements per scaling factor. + int32_t const numEltsPerSf = (matrixDtype == tg::Dtype::E2m1) ? 16 : 32; + + switch (layout) { - // The outer dimension. - auto numTokens = matrixType == MatrixType::MatrixA ? options.mM : options.mN; - // The inner dimension. - auto hiddenSize = options.mK; - // The outer tile dimension. - auto numTokensPerTile = matrixType == MatrixType::MatrixA ? options.mTileM : options.mTileN; - // The inner tile dimension. - auto hiddenSizePerTile = options.mTileK; - // The dtype of the matrix. - tg::Dtype matrixDtype = matrixType == MatrixType::MatrixA ? options.mDtypeA : options.mDtypeB; - // Number of elements per scaling factor. - int32_t const numEltsPerSf = (matrixDtype == tg::Dtype::E2m1) ? 16 : 32; - - switch (layout) - { - case tg::SfLayout::R128c4: - { - // The scaling factor tensor packs 128x4 tiles into contiguous 512B blocks. - // The 512B block maps to a 32x16B (32x128b) block in TMEM. - // See https://nvbugspro.nvidia.com/bug/4165523 - // - // Additionally, we have to meet constraints of TMA that the box dimensions are less - // than 256 and boxDim[0] is a multiple of 16B. 
- // - // The "logical" tensor is: [outer, inner / numEltsPerSf] - // The aforementioned format is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 512] - // The shape we use for TMA is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 2, 256] - - auto shape = std::vector{256, 2, static_cast(tg::ceilDiv(hiddenSize, numEltsPerSf * 4)), - static_cast(tg::ceilDiv(numTokens, 128))}; - - std::vector stride(shape.size()); - stride[0] = 1; - for (size_t i = 1; i < shape.size(); i++) - { - stride[i] = shape[i - 1] * stride[i - 1]; - } - - auto tileShapes - = std::vector{256, 2, static_cast(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4)), - static_cast(tg::ceilDiv(numTokensPerTile, 128))}; - - return std::make_tuple(shape, stride, tileShapes); - } - - case tg::SfLayout::R8c4: - { - // The scaling factor tensor packs 8x4 tiles into contiguous 32B blocks. - // - // As the inner dimension (k) is required to be a multiple of the tile size, we - // can reshape to use fewer read requests, if the tile dimensions allow. 
- // I.e., let's define r = min(⌈hiddenSizePerTile / (numEltsPerSf * 4)⌉, 8) - // - // The "logical" tensor is: [outer, inner / numEltsPerSf] - // The 8x4 SF layout is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 32] - // The TMA tensor shape is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf * r), r * 32] - - int const repeats = std::min(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4), 8); - - auto shape = std::vector{static_cast(repeats * 32), - static_cast(tg::ceilDiv(hiddenSize, numEltsPerSf * 4 * repeats)), - static_cast(tg::ceilDiv(numTokens, 8))}; - - std::vector stride(shape.size()); - stride[0] = 1; - for (size_t i = 1; i < shape.size(); i++) - { - stride[i] = shape[i - 1] * stride[i - 1]; - } - - auto tileShapes = std::vector{static_cast(repeats * 32), - static_cast(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4 * repeats)), - static_cast(tg::ceilDiv(numTokensPerTile, 8))}; - - return std::make_tuple(shape, stride, tileShapes); - } - - default: throw std::runtime_error("Unsupported SF layout"); - } - return std::make_tuple(std::vector{}, std::vector{}, std::vector{}); - } - - // Setup the kernel parameters. - template - static KernelParams setKernelParams(GemmOptions_ const& options, void const* ptrA, void const* ptrSfA, - void const* ptrPerTokenSfA, void const* ptrB, void const* ptrSfB, void const* ptrPerTokenSfB, void* ptrC, - void* ptrSfC, void* multimemC, float* ptrScaleC, void* ptrPartialSumsForSplitK, void* ptrTileBars, - void* multimemTileBars, void* ptrCompletionBars, void* multimemCompletionBars, void* ptrSplitKCompletionBars, - int32_t* ptrNumNonExitingCtas, int rank, int tpGrpSize) + case tg::SfLayout::R128c4: { + // The scaling factor tensor packs 128x4 tiles into contiguous 512B blocks. + // The 512B block maps to a 32x16B (32x128b) block in TMEM. + // See https://nvbugspro.nvidia.com/bug/4165523 + // + // Additionally, we have to meet constraints of TMA that the box dimensions are less + // than 256 and boxDim[0] is a multiple of 16B. 
+ // + // The "logical" tensor is: [outer, inner / numEltsPerSf] + // The aforementioned format is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 512] + // The shape we use for TMA is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 2, 256] - // Is one-shot all-reduce? - bool const oneShotAr{options.mAllReduceAlgo == AllReduceAlgo::OneShot}; - // Is two-shot all-reduce? - bool const twoShotAr{options.mAllReduceAlgo == AllReduceAlgo::TwoShot}; - // Are there peer devices? - bool const multiDevice{tpGrpSize > 1}; + auto shape = std::vector{256, 2, static_cast(tg::ceilDiv(hiddenSize, numEltsPerSf * 4)), + static_cast(tg::ceilDiv(numTokens, 128))}; - // Create the return struct. - KernelParams params; - - // Shape/stride for gmem tensor A. - auto [shapeA, strideA] = makeTmaShapeStrideAb(options, MatrixType::MatrixA); - // Build tma descriptor for A. - params.tmaA = gemm::buildNdTmaDescriptor(options.mDtypeA, options.mMmaKind, shapeA, strideA, - options.mTransposeMatrixA ? options.mTileK : options.mTileM, - options.mTransposeMatrixA ? options.mTileM : options.mTileK, const_cast(ptrA)); - - // Shape/stride for gmem tensor B. - auto [shapeB, strideB] = makeTmaShapeStrideAb(options, MatrixType::MatrixB); - // Build tma descriptor for B. - params.tmaB = gemm::buildNdTmaDescriptor(options.mDtypeB, options.mMmaKind, shapeB, strideB, - !options.mTransposeMatrixB ? options.mTileK : options.mTileN, - !options.mTransposeMatrixB ? options.mTileN : options.mTileK, const_cast(ptrB), - /* swizzle */ !options.mSliceK); - - if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE2m1 - || options.mDtypeA == tg::Dtype::MxE4m3) + std::vector stride(shape.size()); + stride[0] = 1; + for (size_t i = 1; i < shape.size(); i++) { - tg::Dtype const dTypeSfA = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; - - // Build TMA descriptor for gmem A block scaling factors. 
- auto [shapeSfA, strideSfA, tileShapesSfA] - = makeTmaShapeStrideSfAb(options, MatrixType::MatrixA, tg::SfLayout::R128c4); - params.tmaSfA - = gemm::buildSfTmaDescriptor(dTypeSfA, shapeSfA, strideSfA, tileShapesSfA, const_cast(ptrSfA)); + stride[i] = shape[i - 1] * stride[i - 1]; } - if (options.mDtypeB == tg::Dtype::E2m1 || options.mDtypeB == tg::Dtype::MxE2m1 - || options.mDtypeB == tg::Dtype::MxE4m3) - { - tg::Dtype const dTypeSfB = (options.mDtypeB == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; + auto tileShapes + = std::vector{256, 2, static_cast(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4)), + static_cast(tg::ceilDiv(numTokensPerTile, 128))}; - // Build TMA descriptor for gmem B block scaling factors. - auto [shapeSfB, strideSfB, tileShapesSfB] - = makeTmaShapeStrideSfAb(options, MatrixType::MatrixB, options.mSfLayoutB); - params.tmaSfB - = gemm::buildSfTmaDescriptor(dTypeSfB, shapeSfB, strideSfB, tileShapesSfB, const_cast(ptrSfB)); - } - - if (options.mUseTmaStore) - { - // Shape/stride for gmem tensor C. - auto [shapeC, strideC] = makeTmaShapeStrideC(options); - - // Swap M and N tiles for the M-major epilogue. - auto outputTileM = options.mTransposeMmaOutput ? options.mEpilogueTileN : options.mEpilogueTileM; - auto outputTileN = options.mTransposeMmaOutput ? options.mEpilogueTileM : options.mEpilogueTileN; - - // One-shot performs TMA reduction on multicast mapping of the output buffer directly. - // Two-shot performs TMA store on unicast mapping of the output buffer. The reduction happens - // in the next phase. - void* ptrTmaC{oneShotAr && multiDevice ? multimemC : ptrC}; - auto dtypeC{options.mDtypeC}; - // Regardless of output dtype, two-shot all-reduce store partial - // accumulation results to global memory in float32 precision. - if (twoShotAr && multiDevice) - { - dtypeC = options.mDtypeAcc; - } - - // Build tma descriptor for C. 
- params.tmaC = gemm::buildNdTmaDescriptor( - dtypeC, tg::MmaKind::Auto, shapeC, strideC, outputTileM, outputTileN, const_cast(ptrTmaC)); - } - - // Set the dequantization factors for A and B when DeepSeek FP8 recipe is used. - params.ptrSfA = ptrSfA; - params.ptrSfB = ptrSfB; - - // Set the per-token scale factors for MetaFP8 or scale inputs - params.ptrPerTokenSfA = ptrPerTokenSfA; - params.ptrPerTokenSfB = ptrPerTokenSfB; - - // Also set ptrC (it may be used by the NCCL reduction code in "layers/Llama"). - params.ptrC = ptrC; - params.ptrScaleC = ptrScaleC; - - // The block scaling factors of C for MxFp{4,8} and NvFp4 formats. - // (not to be confused with the tensor-level scaling factor stored in ptrScaleC) - params.ptrSfC = ptrSfC; - - params.m = options.mM; - params.n = options.mN; - params.k = options.mK; - - params.rank = rank; - params.tpGrpSize = tpGrpSize; - - params.multimemC = multimemC; - params.ptrPartialSumsForSplitK = ptrPartialSumsForSplitK; - params.ptrTileBars = ptrTileBars; - params.multimemTileBars = multimemTileBars; - params.ptrCompletionBars = ptrCompletionBars; - params.multimemCompletionBars = multimemCompletionBars; - - params.ptrSplitKCompletionBars = ptrSplitKCompletionBars; - params.ptrNumNonExitingCtas = ptrNumNonExitingCtas; - return params; + return std::make_tuple(shape, stride, tileShapes); } - // Setup the kernel parameters. - template - static KernelParams setKernelParams(GemmOptions_ const& options, void const* ptrA, void const* ptrB, void* ptrC, - void* multimemC, float const* ptrScaleC, void* ptrTileBars, void* multimemTileBars, void* ptrCompletionBars, - void* multimemCompletionBars, int rank, int tpGrpSize) + case tg::SfLayout::R8c4: { - return setKernelParams(options, ptrA, nullptr, ptrB, nullptr, ptrC, multimemC, ptrScaleC, ptrTileBars, - multimemTileBars, ptrCompletionBars, multimemCompletionBars, rank, tpGrpSize); + // The scaling factor tensor packs 8x4 tiles into contiguous 32B blocks. 
+ // + // As the inner dimension (k) is often a multiple of the tile size, we can reshape to use + // fewer read requests, if the tile dimensions allow. It does not reduce the number of + // instructions. + // + // I.e., let's define r = min(⌈hiddenSizePerTile / (numEltsPerSf * 4)⌉, 8) + // + // The "logical" tensor is: [outer, inner / numEltsPerSf] + // The 8x4 SF layout is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf), 32] + // The TMA tensor shape is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf * r), r * 32] + // + // The caveat of NumRepeats>1 is we must pad the hidden dimension of SF to multiples of + // NumRepeats * numEltsPerSf * 4. + + // Detect if the supplied factor is power of 2. E.g., 0b0100 and (0b0100 - 1) == 0b0000. + int const r = options.mSfReshapeFactor; + if (r > 0 && (r & (r - 1)) != 0) + { + throw std::runtime_error("mSfReshapeFactor must be positive and a power of 2. Found " + std::to_string(r)); + } + + // Sanitize number of repeats so it doesn't exceed the dimension. + int const repeats = std::min(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4), r); + + // Detect if the input hidden size K is a multiple of the repeats. 
+ if (tg::ceilDiv(hiddenSize, numEltsPerSf * 4) % repeats != 0) + { + throw std::runtime_error("SF hiddenSize K (" + std::to_string(tg::ceilDiv(hiddenSize, numEltsPerSf * 4)) + + ") must be a multiple of repeats (" + std::to_string(repeats) + ")"); + } + + auto shape = std::vector{static_cast(repeats * 32), + static_cast(tg::ceilDiv(hiddenSize, numEltsPerSf * 4 * repeats)), + static_cast(tg::ceilDiv(numTokens, 8))}; + + std::vector stride(shape.size()); + stride[0] = 1; + for (size_t i = 1; i < shape.size(); i++) + { + stride[i] = shape[i - 1] * stride[i - 1]; + } + + auto tileShapes = std::vector{static_cast(repeats * 32), + static_cast(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4 * repeats)), + static_cast(tg::ceilDiv(numTokensPerTile, 8))}; + + return std::make_tuple(shape, stride, tileShapes); } + + default: throw std::runtime_error("Unsupported SF layout"); + } + return std::make_tuple(std::vector{}, std::vector{}, std::vector{}); +} + +// Setup the kernel parameters. +template +static KernelParams setKernelParams(GemmOptions_ const& options, void const* ptrA, void const* ptrSfA, + void const* ptrPerTokenSfA, void const* ptrB, void const* ptrSfB, void const* ptrPerTokenSfB, void const* ptrBias, + void* ptrC, void* ptrSfC, void* multimemC, float* ptrScaleC, void* ptrPartialSumsForSplitK, void* ptrTileBars, + void* multimemTileBars, void* ptrCompletionBars, void* multimemCompletionBars, void* ptrSplitKCompletionBars, + int32_t* ptrNumNonExitingCtas, int rank, int tpGrpSize) +{ + + // Is one-shot all-reduce? + bool const oneShotAr{options.mAllReduceAlgo == AllReduceAlgo::OneShot}; + // Is two-shot all-reduce? + bool const twoShotAr{options.mAllReduceAlgo == AllReduceAlgo::TwoShot}; + // Are there peer devices? + bool const multiDevice{tpGrpSize > 1}; + + // Create the return struct. + KernelParams params; + + // Shape/stride for gmem tensor A. 
+ auto [shapeA, strideA, tileShapeA] = makeTmaShapeStrideAb(options, MatrixType::MatrixA); + // Build tma descriptor for A. + params.tmaA = gemm::buildNdTmaDescriptor( + options.mDtypeA, options.mMmaKind, shapeA, strideA, tileShapeA, const_cast(ptrA)); + + // Shape/stride for gmem tensor B. + auto [shapeB, strideB, tileShapeB] = makeTmaShapeStrideAb(options, MatrixType::MatrixB); + // Build tma descriptor for B. + params.tmaB = gemm::buildNdTmaDescriptor(options.mDtypeB, options.mMmaKind, shapeB, strideB, tileShapeB, + const_cast(ptrB), + /* swizzle */ !options.mSliceK); + + if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE2m1 + || options.mDtypeA == tg::Dtype::MxE4m3) + { + tg::Dtype const dTypeSfA = (options.mDtypeA == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; + + // Build TMA descriptor for gmem A block scaling factors. + auto [shapeSfA, strideSfA, tileShapesSfA] + = makeTmaShapeStrideSfAb(options, MatrixType::MatrixA, tg::SfLayout::R128c4); + params.tmaSfA + = gemm::buildSfTmaDescriptor(dTypeSfA, shapeSfA, strideSfA, tileShapesSfA, const_cast(ptrSfA)); + } + + if (options.mDtypeB == tg::Dtype::E2m1 || options.mDtypeB == tg::Dtype::MxE2m1 + || options.mDtypeB == tg::Dtype::MxE4m3) + { + tg::Dtype const dTypeSfB = (options.mDtypeB == tg::Dtype::E2m1) ? tg::Dtype::E4m3 : tg::Dtype::UE8m0; + + // Build TMA descriptor for gmem B block scaling factors. + auto [shapeSfB, strideSfB, tileShapesSfB] + = makeTmaShapeStrideSfAb(options, MatrixType::MatrixB, options.mSfLayoutB); + params.tmaSfB + = gemm::buildSfTmaDescriptor(dTypeSfB, shapeSfB, strideSfB, tileShapesSfB, const_cast(ptrSfB)); + } + + if (options.mUseTmaStore) + { + // Shape/stride for gmem tensor C. + auto [shapeC, strideC] = makeTmaShapeStrideC(options); + + // Swap M and N tiles for the M-major epilogue. + auto outputTileM = options.mTransposeMmaOutput ? options.mEpilogueTileN : options.mEpilogueTileM; + auto outputTileN = options.mTransposeMmaOutput ? 
options.mEpilogueTileM : options.mEpilogueTileN; + + // One-shot performs TMA reduction on multicast mapping of the output buffer directly. + // Two-shot performs TMA store on unicast mapping of the output buffer. The reduction happens + // in the next phase. + void* ptrTmaC{oneShotAr && multiDevice ? multimemC : ptrC}; + auto dtypeC{options.mDtypeC}; + // Regardless of output dtype, two-shot all-reduce store partial + // accumulation results to global memory in float32 precision. + if (twoShotAr && multiDevice) + { + dtypeC = options.mDtypeAcc; + } + + // Build tma descriptor for C. + params.tmaC = gemm::buildNdTmaDescriptor(dtypeC, tg::MmaKind::Auto, shapeC, strideC, + std::vector{outputTileN, outputTileM}, const_cast(ptrTmaC)); + } + + // Set the dequantization factors for A and B when DeepSeek FP8 recipe is used. + params.ptrSfA = ptrSfA; + params.ptrSfB = ptrSfB; + + // Set the per-token scale factors for MetaFP8 or scale inputs + params.ptrPerTokenSfA = ptrPerTokenSfA; + params.ptrPerTokenSfB = ptrPerTokenSfB; + + // Set the bias. + params.ptrBias = ptrBias; + + // Also set ptrC (it may be used by the NCCL reduction code in "layers/Llama"). + params.ptrC = ptrC; + params.ptrScaleC = ptrScaleC; + + // The block scaling factors of C for MxFp{4,8} and NvFp4 formats. 
+ // (not to be confused with the tensor-level scaling factor stored in ptrScaleC) + params.ptrSfC = ptrSfC; + + params.m = options.mM; + params.n = options.mN; + params.k = options.mK; + + params.rank = rank; + params.tpGrpSize = tpGrpSize; + + params.multimemC = multimemC; + params.ptrPartialSumsForSplitK = ptrPartialSumsForSplitK; + params.ptrTileBars = ptrTileBars; + params.multimemTileBars = multimemTileBars; + params.ptrCompletionBars = ptrCompletionBars; + params.multimemCompletionBars = multimemCompletionBars; + + params.ptrSplitKCompletionBars = ptrSplitKCompletionBars; + params.ptrNumNonExitingCtas = ptrNumNonExitingCtas; + return params; +} #endif -}; +}; // namespace KernelParamsSetup //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParamsDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParamsDecl.h new file mode 100644 index 0000000000..f248278acc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelParamsDecl.h @@ -0,0 +1,324 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & + * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +// NOTE: keep this code dependency free. 
It has to be included by the device code and has to be +// compilable with NVRTC. + +namespace gemm +{ + +namespace gemm +{ + +struct KernelParams +{ +#ifdef TLLM_ENABLE_CUDA + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Gemm parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // TMA descriptor for A. + // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from + // makeTmaShapeStrideAb. + // + // If layoutA is MatrixLayout::MajorK + // Logical shape is [M, K]. + // Logical strides are [K, 1]. + // Tile box shape is [tileM, tileK]. + // Tile box strides are [tileK, 1]. + // Dtype is set from options.mDtypeA. + // + // If layoutA is MatrixLayout::MajorMn + // Logical shape is [K, M]. + // Logical strides are [M, 1]. + // Tile box shape is [tileK, tileM]. + // Tile box strides are [tileM, 1]. + // Dtype is set from options.mDtypeA. + // + // If layoutA is MatrixLayout::BlockMajorK + // Logical shape is [K / blockK, M, blockK]. + // Logical strides are [M * blockK, blockK, 1]. + // Tile box shape is [tileK / min(blockK, tileK), tileM, min(blockK, tileK)]. + // Tile box strides are [tileM * min(blockK, tileK), min(blockK, tileK), 1]. + // Dtype is set from options.mDtypeA, and blockK is 128B. + CUtensorMap tmaA; + + // TMA descriptor for B. + // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from + // makeTmaShapeStrideAb. + // + // If layoutB is MatrixLayout::MajorK + // Logical shape is [N, K]. + // Logical strides are [K, 1]. + // Tile box shape is [tileN, tileK]. + // Tile box strides are [tileK, 1]. + // Dtype is set from options.mDtypeB. + // + // If layoutB is MatrixLayout::MajorMn + // Logical shape is [K, N]. + // Logical strides are [N, 1]. + // Tile box shape is [tileK, tileN]. + // Tile box strides are [tileN, 1]. + // Dtype is set from options.mDtypeB. 
+ // + // If layoutB is MatrixLayout::BlockMajorK + // Logical shape is [K / blockK, N, blockK]. + // Logical strides are [N * blockK, blockK, 1]. + // Tile box shape is [tileK / min(blockK, tileK), tileN, min(blockK, tileK)]. + // Tile box strides are [tileN * min(blockK, tileK), min(blockK, tileK), 1]. + // Dtype is set from options.mDtypeB, and blockK is 128B. + CUtensorMap tmaB; + + // TMA descriptor for C, (when useTmaStore is true) + // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from + // makeTmaShapeStrideC. + // + // If transposeMmaOutput is false, + // Logical shape is [M, N]. + // Logical strides are [N, 1]. + // Tile box shape is [epilogueTileM, epilogueTileN]. + // Tile box strides are [epilogueTileN, 1]. + // Dtype is set from options.mDtypeC. + // + // If transposeMmaOutput is true, + // Logical shape is [N, M]. + // Logical strides are [M, 1]. + // Tile box shape is [epilogueTileN, epilogueTileM]. + // Tile box strides are [epilogueTileM, 1]. + // Dtype is set from options.mDtypeC. + CUtensorMap tmaC; + + // TMA descriptor for the block scaling factors for A, for MxFp{4,8} and NvFp4 formats. + // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from + // makeTmaShapeStrideSfAb. + // The layout of scaling factors for A is always R128c4 + // + // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. + // K must be a multiple of 4P. + // The "logical" shape is: [M, K / P]. + // The R128c4 layout is: [⌈M / 128⌉, K / P / 4, 512]. + // The shape we use for TMA is: [⌈M / 128⌉, K / P / 4, 2, 256]. + // + // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. + CUtensorMap tmaSfA; + + // TMA descriptor for the block scaling factors for B, for MxFp{4,8} and NvFp4 formats. + // Must be setup using gemm::buildSfTmaDescriptor with shapes and strides from + // makeTmaShapeStrideSfAb. + // The layout of scaling factors for B is controlled by options.mSfLayoutB. 
+ // + // Let P be the number of elements per SF. P=16 for NvFp4, P=32 for Mx formats. + // The "logical" shape is: [N, K / P] + // + // If the layout is R128c4, + // K must be a multiple of 4P. + // The R128c4 layout is: [⌈N / 128⌉, K / P / 4, 512] + // The shape we use for TMA is: [⌈N / 128⌉, K / P / 4, 2, 256] + // + // If the layout is R8c4, + // K must be a multiple of 4P. + // The R8c4 layout is: [⌈N / 8⌉, K / P / 4, 32] + // The shape we use for TMA is: [⌈N / 8⌉, K / P / 4 / r, r * 32] + // where r = min(tileK / P / 4, 8) + // + // Dtype is Dtype::E4m3 for NvFp4, Dtype::UE8m0 for Mx formats. + CUtensorMap tmaSfB; + + // The output matrix C. The data type is controlled by options.mDtypeC. + // + // When transposeMmaOutput is true, the shape is [N, M]. + // Otherwise, the shape is [M, N]. + // Elements in a given row are stored contiguously in memory (row-major). + void* ptrC; + + // The block scaling factors to dequantize A. + // + // If DeepSeek FP8 recipe is used: + // If transposeMmaOutput is false, shape is [K / 128, M]. + // Otherwise, shape is [M / 128, K / 128]. + // The rightmost dimension is contiguous in memory. + // + // If DeepSeek FP8 recipe is not used, but for MxFp{4,8} and NvFp4 formats: + // The layout and data type is the same as explained in tmaSfA. + // + // Otherwise should be set to nullptr. + void const* ptrSfA; + + // The scaling factors to dequantize B. + // + // If DeepSeek FP8 recipe is used: + // If transposeMmaOutput is false, shape is [N / 128, K / 128]. + // Otherwise, shape is [K / 128, N]. + // The rightmost dimension is contiguous in memory. + // + // If DeepSeek FP8 recipe is not used, but for MxFp{4,8} and NvFp4 formats: + // The layout and data type is the same as explained in tmaSfB. + // + // Otherwise should be set to nullptr. + void const* ptrSfB; + + // The bias applied after the GEMM. + // The bias is applied before applying the global scaling factor. I.e. 
+ // C' = (A * B + bias') * scaleC + // scaleC = dequantA * dequantB * quantC + // Thus, the bias' = bias / (dequantA * dequantB), where the bias is the original bias. + // + // if BiasType is N, the shape is [N]. + // The bias is broadcasted along the M dimension. + // + // if BiasType is M, the shape is [M]. + // The bias is broadcasted along the N dimension. + // + // The dtype is float32. + void const* ptrBias; + + // The per-token scaling factors from scale A. + // + // This is used for either: + // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 + // * When the routing scales are applied to the input activations (only when output is not + // transposed). The dtype is Dtype::Bfloat16 + // + // The shape is [M] + void const* ptrPerTokenSfA; + + // The per-token scaling factors from scale B. + // + // This is used for either: + // * Per-token scaling factor quantization schemes, such as MetaFP8. The dtype is Dtype::Float32 + // * When the routing scales are applied to the input activations (only when output is + // transposed). The dtype is Dtype::Bfloat16 + // + // The shape is [N] + void const* ptrPerTokenSfB; + + // The scaling factors calculated when quantizing C, for MxFp{4,8} and NvFp4 formats, also + // used for the DeepSeek FP8 recipe. + // + // For DeepSeek FP8 recipe: + // If transposeMmaOutput is false, shape is [N / 128, M]. + // Otherwise, shape is [M / 128, N]. + // The rightmost dimension is contiguous in memory. + // + // For MxFp{4,8} and NvFp4 formats: + // If transposeMmaOutput is false, shape is [M, N / 16]. + // Otherwise, shape is [N, M / 16]. + // The layout is controlled by options.mSfLayoutC (either R128c4 or R8c4). + void* ptrSfC; + + // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. + // TensorRT-LLM API requires a scaling factor on the device. + // Shape is [1]. + float const* ptrScaleC; + + // The M dimension. 
+ // It is the total number of tokens if A is the activation matrix. + // It is the total number of output channels if A is the weight matrix. + int32_t m; + // The N dimension. + // It is the total number of tokens if B is the activation matrix. + // It is the total number of output channels if B is the weight matrix. + int32_t n; + // The K dimension. It is the hidden dimension of the input matrices. + int32_t k; + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // All-reduce parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // The rank id of the current device in the multi-gpu space. + int rank; + // The number of peer devices in tensor-parallel group. + int tpGrpSize; + // Pointer for output with multicast mapping. It is used by the "reduce" op (LDGMC.ADD) of the + // two-shot reduce-scatter phase. + // The shape is [M, N] and the dtype is float. + void* multimemC; + + // The barriers in global memory. + // + // The kernel arrives at (with release ordering) the multicast mapping of the barrier to broadcast + // amongst peer devices. It then waits (with acquire ordering) for the unicast mapping of the + // barrier. + // + // Flags in global memory that sync on "entrance" of reduce-scatter phase in two-shot all-reduce. + // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. + // The pointer to the unicast memory created with IpcNvlsHandle. + // Must be set to 0 before the kernel launch. + void* ptrTileBars; + // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. + // The pointer to the multicast memory created with IpcNvlsHandle. + void* multimemTileBars; + + // Flags in global memory that sync on "exit" after the all-reduce finishes. + // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. + // The pointer to the unicast memory created with IpcNvlsHandle. 
+ // Must be set to 0 before the kernel launch. + void* ptrCompletionBars; + // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. + // The pointer to the multicast memory created with IpcNvlsHandle + void* multimemCompletionBars; + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Miscellaneous parameters. + // + ////////////////////////////////////////////////////////////////////////////////////////////////// + + // The barriers in global memory for Split-k reduction with exchange in GMEM. + // Each CTAs arrives at the barrier and blockIdx.z == gridDim.Z - 1 waits for the barrier to flip + // to perform a reduction. + // The shape is [numTilesM * numTilesN] and the dtype is uint32_t. + // For DeepSeek FP8 recipe, the shape is [numTilesM * numTilesN * 2]. + // The memory must be set to 0 before the kernel launch. + void* ptrSplitKCompletionBars; + + // Pointer to the memory holding the partial sums for split-K in GMEM. + // The shape is [numSlicesForSplitK, numSlicesForSliceK, numTilesM * tileM, numTilesN * tileN]. + // The dtype is dtypeAcc, i.e. float. + void* ptrPartialSumsForSplitK; + + // In some cases, some CTAs need to exit early. E.g. when the grid is statically set, but the + // actual workload is decided at runtime. This device pointer maps to the number of non exiting + // CTAs in the X dim of the grid when transposeMmaOutput is false. And the Y dim, otherwise. + // The pointer points to a scalar and the dtype is int32_t. The pointed value must be >= 0. + int32_t* ptrNumNonExitingCtas; + + ////////////////////////////////////////////////////////////////////////////////////////////////// + // + // Miscellaneous parameters. 
+ // + ////////////////////////////////////////////////////////////////////////////////////////////////// + + enum class MatrixType + { + MatrixA = 0, + MatrixB + }; +#endif +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace gemm + +} // namespace gemm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h index 9a4db96c7c..3f3b915eee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/KernelTraits.h @@ -20,6 +20,7 @@ #include "trtllm/gen/CommonUtils.h" #include "trtllm/gen/DtypeDecl.h" #include +#include namespace gemm { @@ -77,6 +78,38 @@ public: } // Returns the offset of the ith chunk + int32_t getChunkOffsetByName(std::string const& name) const + { + for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii) + { + if (mSmemChunkNames[ii] == name) + { + return getChunkOffset(ii); + } + } + throw std::runtime_error("Name not found: " + name); + } + + // Returns the first chunk reuse flag given chunk name. 
+ int getFirstChunkReuseFlagByName(std::string const& name) const + { + for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii) + { + if (mSmemChunkNames[ii] == name) + { + return getFirstChunkReuseFlag(ii); + } + } + throw std::runtime_error("Name not found: " + name); + } + + // Function to calculate the total size of the SMEM array + int32_t getTotalSize() const + { + return getOffsetBeforeChunk(static_cast(mNumBytesAndAlignmentPerSmemChunk.size())); + } + +private: int32_t getChunkOffset(int32_t ii) const { if (mFirstChunkReuse[ii]) @@ -91,12 +124,6 @@ public: return getSizePaddedToAlignment(offset, mNumBytesAndAlignmentPerSmemChunk[ii].second); } - // Function to calculate the total size of the SMEM array - int32_t getTotalSize() const - { - return getOffsetBeforeChunk(static_cast(mNumBytesAndAlignmentPerSmemChunk.size())); - } - // Returns the first chunk reuse flag for the ith chunk. int getFirstChunkReuseFlag(int32_t ii) const { @@ -139,9 +166,7 @@ int getNumSmemBitsPerElt(tg::Dtype dtype, tg::MmaKind mmaKind) { if (mmaKind == tg::MmaKind::Auto) { - std::cout << "mmaKind != tg::MmaKind::Auto" << std::endl; - assert(false); - return -1; + throw std::runtime_error("mmaKind != tg::MmaKind::Auto"); } if (mmaKind == tg::MmaKind::MxFp8Fp6Fp4) { @@ -162,11 +187,12 @@ public: KernelTraits() {} // The constructor. 
- KernelTraits(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeAcc, tg::MmaKind mmaKind, - int32_t tileM, int32_t tileN, int32_t tileK, int32_t epilogueTileM, int32_t epilogueTileN, int32_t numStages, - int32_t numStagesMma, int32_t numSlicesForSplitK, int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore, - bool transposeMmaOutput, AllReduceAlgo allReduceAlgo, bool usePersistentScheduler, bool useDeepSeekFp8, - bool usePerTokenSfA, bool usePerTokenSfB) + KernelTraits(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeAcc, tg::Dtype dtypeMmaA, + tg::Dtype dtypeMmaB, tg::MmaKind mmaKind, int32_t tileM, int32_t tileN, int32_t tileK, int32_t epilogueTileM, + int32_t epilogueTileN, int32_t numStages, int32_t numStagesMma, int32_t numSlicesForSplitK, + int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore, bool transposeMmaOutput, + AllReduceAlgo allReduceAlgo, bool usePersistentScheduler, bool useDeepSeekFp8, bool usePerTokenSfA, + bool usePerTokenSfB, BiasType biasType) : mMmaKind{mmaKind} { // @@ -181,16 +207,17 @@ public: // [rowMax ] (16B aligned) (if needed) // [sliceK ] (16B aligned) (if needed) // [per-token SF ] (16B aligned) (if needed) + // [bias ] (16B aligned) (if needed) // // SMEM for smemA and smemB might be repurposed and used for gmemC0 and gmemC1: // // [..smemA..][..smemB..][..smemBShuffle..] - // [..gmemC0..][..gmemC1..][..rowMax..][..sliceK..] + // [..gmemC0..][..gmemC1..][..rowMax..][..sliceK..][..per-token SF..][..bias..] 
// if (mMmaKind == tg::MmaKind::Auto) { - mMmaKind = dtypeGetMmaKind(dtypeA, dtypeB); + mMmaKind = dtypeGetMmaKind(dtypeMmaA, dtypeMmaB); } std::vector> numBytesAndAlignmentPerSmemChunk; @@ -344,6 +371,29 @@ public: firstChunkReuseSmem.emplace_back(false); } + // Bias + { + int32_t numBytesSmemBias = 0; + if (isBiasTypeN(biasType)) + { + numBytesSmemBias = tileN * sizeof(float); + } + else if (isBiasTypeM(biasType)) + { + numBytesSmemBias = tileM * sizeof(float); + } + else if (isBiasTypeMn(biasType)) + { + numBytesSmemBias = tileM * tileN * sizeof(float); + } + // Number of bytes alignment for bias + auto const numBytesAlignmentBias = 16; + // Add info. + smemChunkNames.emplace_back("smemBias"); + numBytesAndAlignmentPerSmemChunk.emplace_back(std::make_pair(numBytesSmemBias, numBytesAlignmentBias)); + firstChunkReuseSmem.emplace_back(false); + } + // Per-block absolute maximum for multi-warp reduction. { // Number of bytes: number of epilogue warps * number of tile columns. @@ -358,6 +408,25 @@ public: firstChunkReuseSmem.emplace_back(false); } + // SmemConstSfBuf + // A buffer used to copy constant values to TMEM. + { + // Do we need the buffer? + bool const useConstSfBuf = dtypeB == tg::Dtype::E4m3 && dtypeMmaB == tg::Dtype::MxE4m3; + // Number of bytes for the buffer. + auto const numSmemBytesConstSfBuf = useConstSfBuf ? 512 : 0; + // Number of bytes for the alignment of the buffer. + auto const numBytesAlignmentConstSfBuf = 16; + // No need to reuse the first chunk. + auto const reuseChunksSmemConstSfBuf = false; + + // Add info. + smemChunkNames.emplace_back("smemConstSfBuf"); + numBytesAndAlignmentPerSmemChunk.emplace_back( + std::make_pair(numSmemBytesConstSfBuf, numBytesAlignmentConstSfBuf)); + firstChunkReuseSmem.emplace_back(reuseChunksSmemConstSfBuf); + } + // Create SMEM helper object. 
mSmemAllocatorHelper = MemAllocatorHelper(numBytesAndAlignmentPerSmemChunk, firstChunkReuseSmem, smemChunkNames); @@ -401,10 +470,12 @@ public: // Matrix A { + // We use TMEM for A if we use slice-K or if we need to cast A. + bool const useTmemA = (numSlicesForSliceK > 1) || (dtypeMmaA != dtypeA); // Number of columns for A. - auto const numTmemColsA = numSlicesForSliceK > 1 ? numStages * tileK - / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeA)) - : 0; + auto const numTmemColsA = useTmemA ? numStages * tileK + / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeMmaA)) + : 0; // Number of columns for A alignment. auto const numColsAlignmentA = 4; // No need to reuse TMEM. @@ -418,12 +489,16 @@ public: // Sf A { - bool const useBlockScalingA = tg::dtypeIsBlockFmt(dtypeA); + // Does the MMA require block scales in TMEM for A? + bool const useBlockScalingA = tg::dtypeIsBlockFmt(dtypeMmaA); + // Are the block scales constant? + bool const useConstSfA = useBlockScalingA && !tg::dtypeIsBlockFmt(dtypeA); // Number of columns for scaling factors of A. - auto const numTmemColsSfA - = useBlockScalingA ? ((tileK / 64) * 2 * tg::ceilDiv(tileM, 64)) * numStages : 0; + auto const numTmemColsSfA = useConstSfA + ? tg::roundUp((tileK / 64) * 2 * tg::ceilDiv(tileM, 64), 4) + : (useBlockScalingA ? ((tileK / 64) * 2 * tg::ceilDiv(tileM, 64)) * numStages : 0); // Number of columns for Sf alignment. - auto const numColsAlignmentSfA = 2; + auto const numColsAlignmentSfA = 4; // No need to reuse TMEM. auto const reuseChunksTmemSfA = false; @@ -435,12 +510,16 @@ public: // Sf B { - bool const useBlockScalingB = tg::dtypeIsBlockFmt(dtypeB); + // Does the MMA require block scales in TMEM for B? + bool const useBlockScalingB = tg::dtypeIsBlockFmt(dtypeMmaB); + // Are the block scales constant? 
+ bool const useConstSfB = useBlockScalingB && !tg::dtypeIsBlockFmt(dtypeB); // Number of columns for scaling factors of B. - auto const numTmemColsSfB - = useBlockScalingB ? ((tileK / 64) * 2 * tg::ceilDiv(tileN, 64)) * numStages : 0; + auto const numTmemColsSfB = useConstSfB + ? tg::roundUp((tileK / 64) * 2 * tg::ceilDiv(tileN, 64), 4) + : (useBlockScalingB ? ((tileK / 64) * 2 * tg::ceilDiv(tileN, 64)) * numStages : 0); // Number of columns for Sf alignment. - auto const numColsAlignmentSfB = 2; + auto const numColsAlignmentSfB = 4; // No need to reuse TMEM. auto const reuseChunksTmemSfB = false; @@ -487,14 +566,14 @@ inline int32_t getTmemBufferSize(KernelTraits traits) inline int32_t getSmemOffsetLoadA(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(0); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadA"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetLoadB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(1); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadB"); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -508,50 +587,63 @@ inline int32_t getSmemOffsetLoadAb(KernelTraits traits) inline int32_t getSmemOffsetLoadShuffleB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(2); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBShuffle"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetGmemC(KernelTraits traits, int resIdx = 0) { - return traits.mSmemAllocatorHelper.getChunkOffset(3 + resIdx); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemGmemC" + std::to_string(resIdx)); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t 
getSmemOffsetRowMax(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(5); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemRowMax"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetSliceK(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(6); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemSliceK"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetPerTokenSf(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(7); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemPerTokenSf"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline int32_t getSmemOffsetBias(KernelTraits traits) +{ + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBias"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetBlockAmax(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(8); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBlockAmax"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline int32_t getSmemOffsetConstSfBuf(KernelTraits traits) +{ + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemConstSfBuf"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0) { - // Be conscious that the index (3 + resIdx) should match the index in getSmemOffsetGmemC(). 
- return traits.mSmemAllocatorHelper.getFirstChunkReuseFlag(3 + resIdx); + return traits.mSmemAllocatorHelper.getFirstChunkReuseFlagByName("smemGmemC" + std::to_string(resIdx)); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -562,28 +654,28 @@ inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0) inline int32_t getTmemOffsetD(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(0); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemD"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetA(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(1); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemA"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetSfA(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(2); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfA"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetSfB(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(3); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfB"); } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/TmaDescriptor.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/TmaDescriptor.h index 0b7574260e..a246ac35b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/TmaDescriptor.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/TmaDescriptor.h @@ -41,14 +41,14 @@ namespace tg = trtllm::gen; #ifdef TLLM_ENABLE_CUDA inline 
CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, std::vector const& shapes, - std::vector const& strides, int32_t tileSizeMn, int32_t tileSizeK, void* gmemAddr, bool doSwizzle = true) + std::vector const& strides, std::vector const& tileShapes, void* gmemAddr, bool doSwizzle = true) { // The multiplication factor of the data padding in SMEM. int32_t padMultiplier = 1; CUtensorMap desc{}; // The data type. CUtensorMapDataType tmaDataFormat{CU_TENSOR_MAP_DATA_TYPE_FLOAT32}; - if (dtype == tg::Dtype::E4m3 || dtype == tg::Dtype::MxE4m3) + if (dtype == tg::Dtype::E4m3 || dtype == tg::Dtype::MxE4m3 || dtype == tg::Dtype::UE8m0) { tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_UINT8; } @@ -71,15 +71,11 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st padMultiplier = 2; tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B; } - else if (mmaKind == tg::MmaKind::MxFp4NvFp4 || mmaKind == tg::MmaKind::Auto) - { - tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B; - } else { - std::cerr << "Invalid dtype / mmaKind combination " << tg::dtypeToString(dtype) << "/" - << tg::mmaKindToString(mmaKind) << std::endl; - assert(false); + // Note: this is used with the MMA kind MxFp4NvFp4 and also when casting to a higher-precision + // type such as Bfloat16 before the MMA. + tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B; } } else if (dtype == tg::Dtype::Fp32) @@ -94,24 +90,30 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st // The swizzle type. 
CUtensorMapSwizzle swizzleType{CU_TENSOR_MAP_SWIZZLE_NONE}; - int32_t tileKSizeInBytes = (tileSizeK * tg::dtypeGetNumBits(dtype) * padMultiplier) / /* bits */ 8; + int32_t fastestDimTileSizeBytes = (tileShapes[0] * tg::dtypeGetNumBits(dtype) * padMultiplier) / /* bits */ 8; if (doSwizzle) { - if ((tileKSizeInBytes % 128) == 0) + if ((fastestDimTileSizeBytes % 128) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_128B; } - else if ((tileKSizeInBytes % 64) == 0) + else if ((fastestDimTileSizeBytes % 64) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_64B; } - else if ((tileKSizeInBytes % 32) == 0) + else if ((fastestDimTileSizeBytes % 32) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_32B; + // This path is only for the scaling factors. + } + else if ((fastestDimTileSizeBytes % 16) == 0 && (dtype == tg::Dtype::UE8m0 || dtype == tg::Dtype::E4m3)) + { + swizzleType = CU_TENSOR_MAP_SWIZZLE_NONE; } else { - std::cerr << "buildNdTmaDescriptor: unexpected tileKSizeInBytes " << tileKSizeInBytes << std::endl; + std::cerr << "buildNdTmaDescriptor: unexpected fastestDimTileSizeBytes " << fastestDimTileSizeBytes + << std::endl; assert(false); } } @@ -121,8 +123,9 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st // Check shape must be in range [1, 2^32] int32_t dim = shapes.size(); - // Expect 2 dimensions. - assert(dim == 2 || dim == 3); + // Expect 2 dimensions for regular gemm, 3 dimensions for batched gemm or blocked layout, and 4 + // dimensions for batched gemm with blocked layout. + assert(dim == 2 || dim == 3 || dim == 4); // Check shape range. for (int32_t ii = 0; ii < dim; ++ii) { @@ -147,59 +150,74 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st // The number of elements in 128B. auto const numEltsIn128B = numEltsPerUInt32 /*4B*/ * 32; // The number of tile K hidden size (per token) in each block of shared memory. 
- auto const numEltsInClampedTileKSize = std::min(numEltsIn128B, tileSizeK); + auto const numEltsInClampedFastestTileSize = std::min(numEltsIn128B, tileShapes[0]); - // Build tile shapes. - std::vector tileShapes(dim, 1); - tileShapes[0] = numEltsInClampedTileKSize; // tileSizeK - tileShapes[1] = tileSizeMn; // tileSizeMn + // Build box dim array. If tileShapes is smaller than dim, just fill with 1s. + assert(static_cast(tileShapes.size()) <= dim); + std::vector boxDim(dim, 1); + boxDim[0] = numEltsInClampedFastestTileSize; + for (size_t ii = 1; ii < tileShapes.size(); ++ii) + { + if (tileShapes[ii] > 256) + { + std::cerr << "buildNdTmaDescriptor: boxDim too large " << tileShapes[ii] << std::endl; + assert(false); + } + else + { + boxDim[ii] = tileShapes[ii]; + } + } // Set tile strides to 1; std::vector tileStrides(dim, 1); // Build the descriptor. CUresult result = cuTensorMapEncodeTiled(&desc, tmaDataFormat, - /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), tileShapes.data(), tileStrides.data(), + /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), boxDim.data(), tileStrides.data(), /*interleave=*/CU_TENSOR_MAP_INTERLEAVE_NONE, swizzleType, /*l2Promotion=*/CU_TENSOR_MAP_L2_PROMOTION_L2_128B, /*oobFill=*/CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); if (result != CUDA_SUCCESS) { - std::cerr << "Error: Failed to initialize the TMA descriptor " << result << std::endl; + char const* errorString; + cuGetErrorString(result, &errorString); + std::stringstream ss; + ss << "Error: Failed to initialize the TMA descriptor " << result << std::endl; - std::cerr << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr - << std::endl; + ss << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; - std::cerr << "Shape: "; + ss << "Shape: "; for (int ii = 0; ii < dim; ++ii) { - std::cerr << shapes[ii] << " "; + ss << shapes[ii] << " "; } - std::cerr << std::endl; + ss << 
std::endl; - std::cerr << "Stride: "; + ss << "Stride: "; for (int ii = 0; ii < dim - 1; ++ii) { - std::cerr << stridesInBytes[ii] << " "; + ss << stridesInBytes[ii] << " "; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileShapes: "; + ss << "tileShapes: "; for (int ii = 0; ii < dim; ++ii) { - std::cerr << tileShapes[ii] << " "; + ss << boxDim[ii] << " "; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileStrides: "; + ss << "tileStrides: "; for (int ii = 0; ii < dim; ++ii) { - std::cerr << tileStrides[ii] << " "; + ss << tileStrides[ii] << " "; } - std::cerr << std::endl; - std::cerr << "swizzleType: " << int(swizzleType) << std::endl; - assert(false); + ss << std::endl; + ss << "swizzleType: " << int(swizzleType) << std::endl; + ss << "(in " << __FILE__ << ":" << __LINE__ << ")" << std::endl; + throw std::runtime_error(ss.str()); } return desc; @@ -267,41 +285,44 @@ inline CUtensorMap buildSfTmaDescriptor(tg::Dtype dtype, std::vector c if (result != CUDA_SUCCESS) { - std::cerr << "Error: Failed to initialize the TMA descriptor for SF " << result << std::endl; + char const* errorString; + cuGetErrorString(result, &errorString); + std::stringstream ss; + ss << "Error: Failed to initialize the TMA descriptor for SF " << errorString << std::endl; - std::cerr << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr - << std::endl; + ss << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; - std::cerr << "shape:"; + ss << "shape:"; for (uint32_t shape_i : shapes) { - std::cerr << " " << shape_i; + ss << " " << shape_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "stridesInBytes:"; + ss << "stridesInBytes:"; for (uint32_t stride_i : stridesInBytes) { - std::cerr << " " << stride_i; + ss << " " << stride_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileShapes:"; + ss << "tileShapes:"; for (uint32_t tileShape_i : 
tileShapes) { - std::cerr << " " << tileShape_i; + ss << " " << tileShape_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileStrides:"; + ss << "tileStrides:"; for (uint32_t tileStride_i : tileStrides) { - std::cerr << " " << tileStride_i; + ss << " " << tileStride_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "swizzleType: " << int(swizzleType) << std::endl; - assert(false); + ss << "swizzleType: " << int(swizzleType) << std::endl; + ss << "(in " << __FILE__ << ":" << __LINE__ << ")" << std::endl; + throw std::runtime_error(ss.str()); } return desc; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/config.json b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/config.json index fbbcdfa059..d502017fc2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/config.json +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/config.json @@ -12,7 +12,6 @@ "epilogueTileM": 128, "epilogueTileN": 8, "numStages": 4, - "numMmaStages": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -43,7 +42,6 @@ "epilogueTileM": 128, "epilogueTileN": 128, "numStages": 3, - "numMmaStages": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -75,7 +73,6 @@ "epilogueTileM": 64, "epilogueTileN": 8, "numStages": 3, - "numMmaStages": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -105,7 +102,6 @@ "epilogueTileM": 128, "epilogueTileN": 128, "numStages": 3, - "numMmaStages": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -135,7 +131,6 @@ "epilogueTileM": 128, "epilogueTileN": 8, "numStages": 3, - "numMmaStages": 1, "numSlicesForSplitK": 2, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -182,7 +177,6 @@ "numStagesMma": 2, "numStagesMmaWithinWorkTile": 2, "useTwoMmaWarps": true, - "useMetaFp8": false, "usePdl": true }, "GemmDeepSeekFp8Throughput": { @@ 
-212,7 +206,6 @@ "numStagesMma": 2, "numStagesMmaWithinWorkTile": 2, "useTwoMmaWarps": true, - "useMetaFp8": false, "usePdl": true, "gridTriggerSecondaryA": true, "gridTriggerSecondaryB": false, @@ -232,7 +225,6 @@ "epilogueTileM": 128, "epilogueTileN": 8, "numStages": 3, - "numMmaStages": 1, "numSlicesForSplitK": 1, "useTwoTmaLoadWarps": true, "clusterDimX": 1, @@ -246,7 +238,6 @@ "useCustomMmaSchedule": true, "sfLayoutB": "8x4", "sfLayoutC": "8x4", - "useMetaFp8": false, "gridTriggerSecondaryB": true, "gridWaitForPrimaryA": false, "gridWaitForPrimaryB": true, diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp deleted file mode 100644 index 3d32c2ee25..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:856ce9e462068d464a244eb5179277c6aeb4eba8c9767b354d664eb6eafee0d3 -size 416980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 1657a0701f..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c461d5767472f619e7cffd41cc609bb9bf244b78342c55a1b42ae344ccc87292 -size 523680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index c21ee7f925..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9d393b7e86991ea2757655b479ef75bfe660f3a1846f46c38e6f55c6ba9d6a25 -size 558316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp deleted file mode 100644 index 2f35e76621..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2ecd12d9d9e7d4cec0e7c530e72328420c868f37bab285ed55864776fc6eeec7 -size 304696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 15535b511a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ff5b953f3226300d647adc3328d04fae0888b2de91f39a27f5ce7efc6f88f15e -size 401032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 6c80cc5381..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3f921e229f0d48546a2087d02f526e0c5c8d5189696ec2e71349227182d1bee0 -size 438480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 2a9bb9cb19..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1acf78f5c7f9505a95f782a4c781c94a9b34bb5958c8f511f32058da40f81868 -size 418890 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index d191118f3b..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:32f834741c1c2f721409b71b5aaa45b79e7d337c5fc422af33a1bbe1b56b3da5 -size 455548 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 9708ab6fef..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:87785c72e84d52ff962f252e98868e1cc3f2595aaa1e9aaf2924fa50e886aba2 -size 458160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 00515a3f3b..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6055448d6d6cbb547b3d5656fecb5044465d88be3121e42e6b3c39f96e3bd828 -size 495608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 9f4c2f4187..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:df7b94d53bc5517b94f6c2c5c7e6108695a32809cbc55e7d83124f07c06a786c -size 426334 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 4179145568..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b64cf6bf660b14299c1957170426f089af93e19fccc93295fd32f0c5df77951d -size 463830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index bacabe8b3a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9359453507f25d7d90bf1f6ac4a453756ba9f8006d6c76fcb3ff09a1ba8cf71a -size 305610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 76d4f50448..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:52a9dc57d86dc176ad59234a764959c0ebec01d9738889a8989fbfca925cf72f -size 338088 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index daef06303d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Bfloat16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:631e364a1ae1a29d386f624849176118c5d4f7b01e38f7c973f190d89e7136f2 -size 506554 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 5d0f3377e6..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d52dfd5ac422362ede96cbda888383f4452df7dc39d6653f8529a560d6b12d37 -size 687361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index adfbdfa4ad..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1b8ed23721a4d1eb0260e36dff2bad3cb1b603d287ef7584cf18c5db73ee869f -size 722835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp deleted file mode 100644 index 354888f282..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp +++ /dev/null 
@@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12cea6d18fe0c95f6eb073e3956ca785835b2740ee65b7d8f934313f709cae87 -size 317072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index e52810e22e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5901f4239aff1497ca81fd9f853fdb35fb2a14ab35c89b1acbde9c87fc909da1 -size 423322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 005124d823..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:db42b1e4d61dc69b6a5ac4304fb3e89eb04e9c858af216ee88f877099f400013 -size 460770 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index f280d1b90d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ca8b1eb8b32d9211ed9f4842548e57128ee6ede550d4dcc3b6ac804de45a9f2b -size 460470 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index c458bd9b23..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:70dc60c09fae6aa9bfa809107032a931b5edfc211d66b374b5092192c902c222 -size 497918 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 3ac72b581f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ae575f338e0db3bb053f9cfcee4804a75611b330b87718e18f7692f865ca6984 -size 538960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 9741082990..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c59fa6c8e96dedcb4956d1fbe3282ed1f809dcd34499bb9b4d7b06bc168b579d -size 575568 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 0ef28473e5..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4c6028820f760089f09b0a2e4b32fa0c76725f56ebdfd97fc6c53331616282d6 -size 438116 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 516f77e48a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0760d0c99a44691f281310268a6751ea62292b2205719b285ff9e7429fcffaf1 -size 475614 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 03aad1c232..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:173abead241a103af02a25daf432d028324561d57f64d9a6d2087cd444c45758 -size 304814 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 02617fa32b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_E4m3_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:92b924875fff3571d0efe3478f2fdddb11a8ec60796830f3dd172a7b4f24acac -size 335712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp deleted file mode 100644 index 9c05d1e6c2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ba1a393cd48175f1cfc18436435ea638de675e0d98847510d97bd03d8bae234d -size 421756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 406e016af2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:54e20ad645aafe8ff45abd1c6c8d418c6d0f2c7cd38eb01d6969a32b7cea60de -size 528458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index f475229583..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x128u2_epilogueTile64x128_mma64x128x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9b74282de9bcfced0c173d9aee838a54c8ba286bbc84719fcdb81dd9cbac7f15 -size 563882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp deleted file mode 100644 index c765b304d3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x32_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c9527b1cee4c45bf8d4346493fddb1b0f8dd3e6abf90ab358dfdf8864533a130 -size 309474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 
100644 index 96c0d7a7ac..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f8669d845fb8bdacda6073d674ca9325a0b241591e1d020020b81977006cf71f -size 400234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 872c8bccec..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x16x128u2_epilogueTile64x16_mma64x16x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c5e32724bfdbe9ac14ad6d54822778c6a1ec63772db04160719aaaed8d29f51 -size 438472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 3d70d3b63c..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3339a4104298437c3628354d2b14431933314e9f2d2ecdce7b88865dccbee038 -size 418882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index d9d1236666..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x32x128u2_epilogueTile64x32_mma64x32x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f4bfa0a6a96895cc2165a02f0c83f2627cf625ff6807a5be9c6f7bdbd210428 -size 456330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 340e167a2d..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ac4fe726d14a219bb03ce7bd9bbdeeb5ea7e3195d64a4c1b9cf0018d53aa3df5 -size 459780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index d44d09ccfb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x64x128u2_epilogueTile64x64_mma64x64x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:de6c8769b5814f20ca34ba8df001c795dc54ae81267488bb18510fe5d334f510 -size 498016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 59566f3bab..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a8b5e0bdece2e73f72e300afab464d0680aa00b9cbcbe08903ba0210780e149 -size 425536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp deleted file mode 100644 index 44856c7481..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x128u2_epilogueTile64x8_mma64x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_dsFp8_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cdb1d9d1b80de2668ddd79db5c491d3a768188b4b3f5c5d10efb335af7ff28ae -size 463034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 4328168985..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp 
+++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0835489e53c9690803dd1e44ec4290b6ddd9d4294bcb6c6d76b9f39680d12964 -size 304814 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index e8a2a1a09b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_E4m3E4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x2_16dp256bit_splitK2_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:520f1eed6f696ceaa3b40bc534a96120a1f01b616f6a93d60606b9e72a13c3d6 -size 336502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 0a252fa5e7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp16_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5a9fa4a70c46c56dbc57d38d6578b413554d5f24ee9bd9e223c9a14dca856db1 -size 504966 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp deleted file mode 100644 index f5ac8a259d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_E2m1E2m1_Fp32_tile128x128x256u2_epilogueTile128x128_mma128x128x64_cluster1x1x1_16dp256bit_TN_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7e36ec5821ab21cdaa67e44755daaea4896b21d8324ad95b1227cd5060ba06df -size 421164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 61c6525168..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_Fp32_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:675e4a797e20b6a606fa670c6a02d23dedc06affba9bedbe9d5eef4c9bf4fc28 -size 505954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 8d2a38bce3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/GemmKernel_MxE4m3_MxE2m1MxE4m3_Fp32_tile128x8x512u2_epilogueTile128x8_mma128x8x32_cluster1x1x1_16dp256bit_TN_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:51dd2e16aae507d655121a3869b4a9db92ed11a69d4a04f56d1716743740d38a -size 516120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..77d0a99b96 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10ab1fa6850215c55485f18f841da757a1c98a10702c69c218a816b36fdac81d +size 402540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..de8d77a2b4 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21882f276d02f03239844b86e520ac19ca9c02c102a94d4c1d3e0455d3fbc195 +size 511260 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..25972befef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a8b4624db20a64ffe4e47af05fe23dfd1b4817948313331c31fa691335442e5 +size 542788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..e1a577ebe2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:aa07f20606b725cb95d7c76c655b82818d06631839f0ec5665ed73485b87c3f8 +size 291096 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..0a4798919c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a5dc8e158b5949c018d35bff4c3ad01d72f83df2ca286950345bc0a56b5b074 +size 388906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..1eb4c5dcbf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7754f52a7e19ccbed43cdb635195b35de68cae5dfbcabbe8426c3b9a074bbc8d +size 423592 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..50a6697360 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f747338e56ab3db6a5d22ef7e869b9f6e00308583f160540460181e3e24e69d +size 405976 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..f440cfd24d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e177fb082e81fefa33065c229c807924afb1c16124ecd166fdad0910bdf29971 +size 441450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..929eeb1dbd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb0c41ea0ea4277e8910fb3c6ec835e19762f864ce67d49515282316a7a07d89 +size 446034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..fcb1d856b6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1cc074235b2516200058ea1b6d33d9fa50985351d6fd337d4f7255ea3786b15 +size 480720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..db8bc1be45 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db41d7ec6fadab632ef27158aeb1016d6a1ba487d8cb24baba53777f19d726ce +size 414998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..100d47e3b7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4ca0d79c0b7c53167348509657267e4ccf29febfb2b0c929a1bad85ee8201c1 +size 449684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..e5c66f76ba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82c9b1a2e6e9570f16b7e38a6570e138edf416433758da010a025550a7d1d83b 
+size 291990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..aa3f9888f9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a30b56272d60bbef5963e35770bd929156306170b275aae47f011b42bd2bf9e +size 296100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..e8ea278b38 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14bec10a3695e2db65fb8254a52bc3db54bf76fae50bc96920e191b02eb0c2a6 +size 482720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..4604422f32 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad597736ad5068946e62334411abdf8e7f31327d31f1b7703b6b881964275ef +size 674989 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..a0d62a251f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4ba8b4d74674c0ae8aed9354b28779c29be451a708ec9c6eb161591ffa3839e +size 707257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..74e13ef7a5 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x128x256u2_s3_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c37fc2c54f52f879c473a5461df4151bbde361fe24b9a70a558b97532beb7ba +size 304112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..4b3ae479c6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e73f3bdba906c7ce17bdecc7fb02b038a8e9315fed1784c3df011245416c99fc +size 411986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..3d2ab00446 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:23a610adf01983ea06f09ecbdfe36ee6116751517171bc8e130403dbb3fc68ca +size 446672 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..cd77c5b4e7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f3ddfdee3b39fe5c2d90c3ed85c44a33b83febb9d86793bb52235ec9f70e8b1 +size 449134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..a90de6422c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:062191382dfe7352ee19677a41b812f149818e7ccff61ab8cf6818ce372d1aea +size 483820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..39631961ee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61c2df67fdf28e34db4a27504547a81de65590fc5dff1b6063bf8649dd2fc6df +size 525206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..f5bb5509bc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:740ea0a536f0047c05dbe66c91294857be694e1260e41713f1f1e0b5b37ee5ff +size 561470 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..42d8202a61 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6b4e8b684058a352343892a43f2ceff21b645560c36cb5ed9ef5a62b5903add +size 425992 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..fe8d9e4f3c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d9a0badabc929e926cba4ad2f8f88d64225ee6501f883f8123d8d7b1938f0c5 +size 460676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..a907909b32 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c01197f1c4cab230b57a8b36a026038af238423432fce8e1fd883481d2935034 +size 289614 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..4c2b2a627d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14c5c5ce6f433c0a5706f0f353f4c45db650188fd1c7d07ef64b47d226a12bc1 +size 294514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..d8b2ba966a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e5b61a079c0a5a9ac8008e86155eea2261bc75bfaff0ac50cb14aec85418cd0 +size 408106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..97e546ee94 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca0c14f039c4f04866dec6567a530af9c50b6c55735a9de2be39c5061509dc91 +size 516826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..128565544d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x128u2_s4_et64x128_m64x128x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d1436f956bca72e2d0299474607d229e5fae2d2095fb230eb77a0f024b0582d +size 548354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..87c6045e55 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x128x256u2_s2_et128x128_m128x128x32_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a9c67d984671b707b7dbb116dbceca1e1ad9783b41654a12e86c296d74ff8ec +size 295874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..bb9cec8d50 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62f39085d942f91d6345e40d5fb8ad503da5bf10d5dc7cc14bdc8f4c81e84ab8 +size 388110 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..1bab11accf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x16x128u2_s4_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:da82f5b1fa833552029337e9dc15065cb72b05ee9cb232aff3dcae5e9dccabe1 +size 423584 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..2c819e4de0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da251f397db77fa29ad3d46b34d9cf5e26222faaa989132d4f647731b8ae93bf +size 405968 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..beb2a10735 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7eec256e19ebf1224223cc00b9f1070f5f9609262a223417911c91613fa8169 +size 442232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..e2ad3277bf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a657c85ce833955d14af4fe2f7d7a064b203a81eef02ba58ec72dbef02d99be3 +size 447654 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..7f5dd60f15 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b2a3a197d059e3127c05eff531b841cb7664ea508a297fcca167e91f8be6e43 +size 483080 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..ee86d503d0 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c22c5844544b88950bcaa9ea162cb0d9d19d48bb33006e7d50d2f9974175d49c +size 413412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..c1e8c2d985 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b01d19683cfc3ec1d38291cff6b96f430dda1ab0d0d7666af2766a6212d6f2c +size 448886 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..a3b6d9c5f4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:736250708edba66db91c9f3672ab850ac53b90848c1cd6e232d064a1d9b5a930 +size 290404 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..c83f197419 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e44ca67850f9c8872f061c045bbd59e99f7b76be5e1a946434a6b1c20da6af76 +size 295304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..30109803c8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d67677108c5a5b7728ab83464d503b0fd54c6e5fdb46f2b1db301049b9c76ae +size 481134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..9165f1641c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_TN_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67e6daf5493c8cff97092f7c180f0dca736bc7df3a51c1fd8c537c5c8fcf65f3 +size 406676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..b74bd86aed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_Fp32_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab06bf2b980dd36301380601ad2c366dd4369270869a71e8e8b7f61f2242e77a +size 482120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp new file mode 100644 index 0000000000..fd6b119a8b --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/trtllmGen_gemm_export/cubins/Gemm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ddc440fa5a729425dc0f453955dda44888657f73927698c8e14cb9a01dce4e1 +size 492288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp index c5d5a18c00..25eb9cd915 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp @@ -26,14 +26,14 @@ namespace tensorrt_llm { namespace kernels { - -static gemmGatedAct::GemmGatedActInterface::ModuleCache globalTrtllmGenGemmGatedActModuleCache; +using namespace gemmGatedAct::gemmGatedAct; +static GemmGatedActInterface::ModuleCache globalTrtllmGenGemmGatedActModuleCache; TrtllmGenGemmGatedActRunner::TrtllmGenGemmGatedActRunner(TrtllmGenGemmGatedActRunnerOptions const& options_) : mOptions(options_) { // Select a GEMM kernel config to use - auto const gemm = gemmGatedAct::GemmGatedActInterface(); + auto const gemm = GemmGatedActInterface(); auto const configs = gemm.getGemmConfigs(); mPassingConfigIndices.clear(); @@ -43,7 +43,7 @@ TrtllmGenGemmGatedActRunner::TrtllmGenGemmGatedActRunner(TrtllmGenGemmGatedActRu auto const options = configs[i].mOptions; // When we include low-latency kernels we can set transposeMmaOutput via constructor - if (options.mDtypeElt == mOptions.eltType && options.mDtypeC == mOptions.outputType + if (options.mDtypeA == mOptions.eltType && options.mDtypeC == mOptions.outputType && options.mUseDeepSeekFp8 == mOptions.deepSeekFp8 && options.mTransposeMmaOutput == mOptions.transposeMmaOutput) { @@ -56,14 +56,14 @@ TrtllmGenGemmGatedActRunner::TrtllmGenGemmGatedActRunner(TrtllmGenGemmGatedActRu size_t 
TrtllmGenGemmGatedActRunner::getWorkspaceSizeInBytes(int32_t m, int32_t n, int32_t k) { - gemmGatedAct::GemmGatedActData gemmData; + GemmGatedActData gemmData; gemmData.mProblemDimensions.mM = mOptions.transposeMmaOutput ? n : m; gemmData.mProblemDimensions.mN = mOptions.transposeMmaOutput ? m : n; gemmData.mProblemDimensions.mK = k; selectGemmConfig(m, n, k); - auto gemm = gemmGatedAct::GemmGatedActInterface(); + auto gemm = GemmGatedActInterface(); auto const configs = gemm.getGemmConfigs(); TLLM_CHECK_WITH_INFO( mSelectedConfigIndex.has_value(), "No valid kernel found for given param config and problem size"); @@ -76,9 +76,9 @@ void TrtllmGenGemmGatedActRunner::run(int32_t m, int32_t n, int32_t k, void cons void const* b, float const* bScale, void* c, float* cScale, float* cScaleGate, void* workspace, CUstream stream, int device) { - auto gemm = gemmGatedAct::GemmGatedActInterface(); + auto gemm = GemmGatedActInterface(); - gemmGatedAct::GemmGatedActData gemmData; + GemmGatedActData gemmData; auto const configs = gemm.getGemmConfigs(); TLLM_CHECK_WITH_INFO( @@ -107,7 +107,7 @@ void TrtllmGenGemmGatedActRunner::run(int32_t m, int32_t n, int32_t k, void cons gemm.runInitBeforeWorldSync(config, gemmData, static_cast(stream)); auto const err = gemm.run(config, workspace, gemmData, static_cast(stream), multiProcessorCount, - globalTrtllmGenGemmGatedActModuleCache); + /*usePdl=*/true, globalTrtllmGenGemmGatedActModuleCache); TLLM_CHECK_WITH_INFO(err == 0, "Error occurred when running GEMM!"); } @@ -120,10 +120,10 @@ void TrtllmGenGemmGatedActRunner::run(int32_t m, int32_t n, int32_t k, void cons void TrtllmGenGemmGatedActRunner::selectGemmConfig(int32_t m, int32_t n, int32_t k) { - auto const gemm = gemmGatedAct::GemmGatedActInterface(); + auto const gemm = GemmGatedActInterface(); auto const configs = gemm.getGemmConfigs(); - gemmGatedAct::GemmGatedActData gemmData; + GemmGatedActData gemmData; // Dims gemmData.mProblemDimensions.mM = mOptions.transposeMmaOutput ? 
n : m; gemmData.mProblemDimensions.mN = mOptions.transposeMmaOutput ? m : n; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h index f7c30c9e0c..cbd6bada46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h @@ -28,8 +28,8 @@ namespace kernels struct TrtllmGenGemmGatedActRunnerOptions { - trtllm::gen::Dtype eltType; - trtllm::gen::Dtype outputType; + gemmGatedAct::trtllm::gen::Dtype eltType; + gemmGatedAct::trtllm::gen::Dtype outputType; bool deepSeekFp8{false}; bool transposeMmaOutput{false}; }; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/Enums.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/Enums.h index 14c5d15b53..d1a31876f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/Enums.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/Enums.h @@ -18,6 +18,9 @@ #include +namespace gemmGatedAct +{ + namespace gemm { @@ -36,6 +39,31 @@ enum class AllReduceAlgo : uint32_t //////////////////////////////////////////////////////////////////////////////////////////////////// +enum class MatrixLayout +{ + // K-major layout (default). [Mn, K] + MajorK = 0, + // M-major for A and N-major for B. [K, Mn] + MajorMn, + // Layout is blocked along the K dimension as seen in the diagram below. 
[K / blockK, Mn, blockK] + // where blockK is fixed at 128B + // + // ├────────────── K ──────────────┤ + // ┬ ┬ ├──── K block ───┤ + // │ │ │ 0 1 2 3 ║ 32 33 34 35 │ + // │ CTA0 │ 4 5 6 7 ║ 36 37 38 39 │ + // │ │ │ 8 9 10 11 ║ 40 41 42 43 │ + // │ ┴ │ 12 13 14 15 ║ 44 45 46 47 │ + // M ┬ ├────────────────║────────────────┤ + // │ │ │ 16 17 18 19 ║ 48 49 50 51 │ + // │ CTA1 │ 20 21 22 23 ║ 52 53 54 55 │ + // │ │ │ 24 25 26 27 ║ 56 57 58 59 │ + // ┴ ┴ │ 28 29 30 31 ║ 60 61 62 63 │ + BlockMajorK +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + enum class SplitK : uint32_t { // No split-k is needed. I.e. mNumSlicesForSplitK == 1. @@ -51,6 +79,20 @@ enum class SplitK : uint32_t //////////////////////////////////////////////////////////////////////////////////////////////////// +enum class BiasType : uint32_t +{ + // No bias. + None = 0, + // One bias value per N of the output tensor. + M = 1, + // One bias value per row M of the output tensor. + N = 2, + // One bias value for each element of the output tensor. + Mn = 3, +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + enum class TileScheduler { // Static scheduler (Non-persistent). @@ -77,4 +119,23 @@ SPLIT_K_FUNCTION(Dsmem) //////////////////////////////////////////////////////////////////////////////////////////////////// +// Helper functions to check the Bias type. 
+ +#define BIAS_TYPE_FUNCTION(Mode) \ + inline bool isBiasType##Mode(BiasType type) \ + { \ + return (type == BiasType::Mode); \ + } + +BIAS_TYPE_FUNCTION(None) +BIAS_TYPE_FUNCTION(N) +BIAS_TYPE_FUNCTION(M) +BIAS_TYPE_FUNCTION(Mn) + +#undef BIAS_TYPE_FUNCTION + +//////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace gemm + +} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h index a8087dc59a..f4cd7e2ad2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActInterface.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include "GemmGatedActOptions.h" #include "KernelParams.h" @@ -29,6 +30,9 @@ namespace gemmGatedAct { +namespace gemmGatedAct +{ + //////////////////////////////////////////////////////////////////////////////////////////////////// // // GemmGatedActData @@ -51,14 +55,19 @@ struct GemmGatedActData int32_t mK{0}; // The rank id of the current device in the multi-gpu space. int32_t mRank{0}; - // The number of peer devices in tensor-parallel group. + // The number of devices in tensor-parallel group. int32_t mWorldSize{0}; }; struct InputBuffers { - // The matrix A. The data type is controlled by options.mDtypeElt. - // The shape is [M, K]. The rightmost dimension is contiguous in memory. + // The matrix A. The data type is controlled by options.mDtypeA. + // + // When layoutA is MatrixLayout::MajorK, the shape is [M, K]. + // When LayoutA is MatrixLayout::MajorMn, the shape is [K, M]. + // When LayoutA is MatrixLayout::BlockMajorK, the shape is [K / blockK, M, blockK] where blockK + // is 128B. 
+ // The rightmost dimension is contiguous in memory. void const* mPtrA{nullptr}; // The block scaling factors to dequantize A. @@ -91,8 +100,13 @@ struct GemmGatedActData // The shape is [M] void const* mPtrPerTokenSfA{nullptr}; - // The matrix B. The data type is controlled by options.mDtypeElt. - // The shape is [N, K]. The rightmost dimension is contiguous in memory. + // The matrix B. The data type is controlled by options.mDtypeB. + // + // When layoutB is MatrixLayout::MajorK, the shape is [N, K]. + // When layoutB is MatrixLayout::MajorMn, the shape is [K, N]. + // When layoutB is MatrixLayout::BlockMajorK, the shape is [K / blockK, N, blockK] where blockK + // is 128B. + // The rightmost dimension is contiguous in memory. void const* mPtrB{nullptr}; // The scaling factors to dequantize B. @@ -132,6 +146,21 @@ struct GemmGatedActData // The shape is [N] void const* mPtrPerTokenSfB{nullptr}; + // The bias applied after the GEMM and before the activation function. + // The bias is applied before the global scaling factor. I.e. + // C = act(A * B + bias') * scaleC + // scaleC = dequantA * dequantB * quantC + // Thus, the bias' = bias / (dequantA * dequantB), where the bias is the original bias. + // + // if BiasType is N, the shape is [N] + // The bias is broadcasted along the M dimension. + // + // if BiasType is M, the shape is [M] + // The bias is broadcasted along the N dimension. + // + // The dtype is float32. + void const* mPtrBias{nullptr}; + // The output tensor scaling factor for MxFp{4,8}, Fp8, NvFp4 and DeepSeek FP8 quantization. // TensorRT-LLM API requires a scaling factor on the device. // Shape is [1]. @@ -140,6 +169,43 @@ struct GemmGatedActData // TensorRT-LLM API requires a scaling factor on the device. // Shape is [1]. void const* mPtrScaleGate{nullptr}; + // The alpha for SwiGlu. + // Alpha is 1.f if nullptr. + // Shape is [1]. + void const* mPtrSwiGluAlpha{nullptr}; + // The beta for SwiGlu. + // Beta is 0.f if nullptr. + // Shape is [1]. 
+ void const* mPtrSwiGluBeta{nullptr}; + // The clamp limit before the activation. + // Clamp limit is FLT_MAX if nullptr. + // When the input is FP8 or NVFP4, the clamp has to be scaled by limit' = limit / dequantAb. + // Shape is [1]. + // + // The given clamp limit applies to the dequantized values, so the order of operations would + // look something like this: + // + // x0 = x0 * dqAb + // x0 = clamp(x0, none, limit) + // x0 = x0 * sigmoid(alpha * x0) + // x1 = dqAb * x1 + // x1 = clamp(x1, -limit, limit) + // out = qC * (x1 + beta) * x0 + // + // Given that the dqAb and qC are combined into scaleC, we can bring the dqAb into the clamp + // limit and apply the clamping prior to dequantization: + // + // x0 = clamp(x0, none, limit / dqAb) + // x0 = x0 * dqAb + // x0 = x0 * sigmoid(alpha * x0) + // x1 = clamp(x1, -limit / dqAb, limit / dqAb) + // scaleC = dqAb * qC + // beta' = beta / dqAb + // out = scaleC * (x1 + beta') * x0 + // + // Note this assumes that scaleAb == scaleGate which is true in TRT-LLM MoE use-case + // + void const* mPtrClampLimit{nullptr}; }; struct OutputBuffers @@ -190,7 +256,7 @@ public: // Launch the cubin from the provided config. It calls all necessary memsets for internal buffers. // Provided config must be validated with isValidConfig before the call. int32_t run(GemmGatedActConfig const& config, void* workspace, GemmGatedActData const& data, void* cudaStream, - int32_t multiProcessorCount, + int32_t multiProcessorCount, bool usePdl = true, std::optional> moduleCache = std::nullopt) const; // Initializes the buffers before the world sync. Must be called before run. 
@@ -343,8 +409,12 @@ bool GemmGatedActInterface::isValidConfig(GemmGatedActConfig const& config, Gemm //////////////////////////////////////////////////////////////////////////////////////////////////// int32_t GemmGatedActInterface::run(GemmGatedActConfig const& config, void* workspace, GemmGatedActData const& data, - void* cudaStream, int32_t multiProcessorCount, std::optional> moduleCache) const + void* cudaStream, int32_t multiProcessorCount, bool usePdl, + std::optional> moduleCache) const { + // Might be used. + (void) usePdl; + (void) moduleCache; // Get options from config and data. auto options = getOptionsFromConfigAndData(config, data); @@ -373,9 +443,12 @@ int32_t GemmGatedActInterface::run(GemmGatedActConfig const& config, void* works // Create kernel params. auto kernelParams = gemmGatedAct::KernelParams::setKernelParams(options, data.mInputBuffers.mPtrA, data.mInputBuffers.mPtrSfA, data.mInputBuffers.mPtrPerTokenSfA, data.mInputBuffers.mPtrB, - data.mInputBuffers.mPtrSfB, data.mInputBuffers.mPtrPerTokenSfB, data.mOutputBuffers.mPtrC, - reinterpret_cast(data.mInputBuffers.mPtrScaleC), data.mOutputBuffers.mPtrSfC, - reinterpret_cast(data.mInputBuffers.mPtrScaleGate), reinterpret_cast(dRowMax), + data.mInputBuffers.mPtrSfB, data.mInputBuffers.mPtrPerTokenSfB, data.mInputBuffers.mPtrBias, + data.mOutputBuffers.mPtrC, reinterpret_cast(data.mInputBuffers.mPtrScaleC), + data.mOutputBuffers.mPtrSfC, reinterpret_cast(data.mInputBuffers.mPtrScaleGate), + reinterpret_cast(data.mInputBuffers.mPtrClampLimit), + reinterpret_cast(data.mInputBuffers.mPtrSwiGluAlpha), + reinterpret_cast(data.mInputBuffers.mPtrSwiGluBeta), reinterpret_cast(dRowMax), reinterpret_cast(dRowMaxBars)); // The size of the grid. 
@@ -395,26 +468,26 @@ int32_t GemmGatedActInterface::run(GemmGatedActConfig const& config, void* works #ifdef TLLM_GEN_EXPORT_INTERFACE CUmodule cuModule; CUfunction cuFunction; + if (moduleCache.has_value()) { ModuleCache& moduleCacheRef = moduleCache.value().get(); - // Modules are associated with a specific context so include the ctxId in the key + // Modules are associated with a specific context, so the context is included in the key CUcontext ctx; unsigned long long ctxId; cuCtxGetCurrent(&ctx); cuCtxGetId(ctx, &ctxId); - // Reinterpret the ctxId as a string to avoid needing a custom hash or converting it to a string in decimal - // representation. + // Reinterpret the ctxId as a string to avoid needing a custom hash or converting it to a + // string in decimal representation. std::string const ctxName = std::string(reinterpret_cast(&ctxId), sizeof(unsigned long long) / sizeof(char)); std::string const funcName = std::string(config.mFunctionName); - // As the ctxName is a fixed number of bytes, the two strings can just be appended without risk of a collision auto const moduleKey = ctxName + funcName; auto module = moduleCacheRef.find(moduleKey); - // Check if module exists in cache. Otherwise, load it + // Use cache if module is found, otherwise load and insert into cache if (module != moduleCacheRef.end()) { cuFunction = std::get<1>(module->second); @@ -444,8 +517,9 @@ int32_t GemmGatedActInterface::run(GemmGatedActConfig const& config, void* works // Run the kernel. 
auto result = trtllm::gen::launchKernel((void*) &kernelParams, cudaStream, config.mSharedMemSize, cuFunction, block3, grid3, cluster3, - config.mOptions.mGridWaitForPrimaryEarlyExit | config.mOptions.mGridWaitForPrimaryA - | config.mOptions.mGridWaitForPrimaryB); + usePdl + && (config.mOptions.mGridWaitForPrimaryEarlyExit | config.mOptions.mGridWaitForPrimaryA + | config.mOptions.mGridWaitForPrimaryB)); if (result != CUDA_SUCCESS) { return -1; @@ -474,3 +548,5 @@ int32_t GemmGatedActInterface::runInitBeforeWorldSync(GemmGatedActConfig const&, } // namespace gemmGatedAct //////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActOptions.h index b23efd2774..a6cf385a13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmGatedActOptions.h @@ -48,6 +48,9 @@ namespace gemmGatedAct { +namespace gemmGatedAct +{ + //////////////////////////////////////////////////////////////////////////////////////////////////// namespace tg = trtllm::gen; @@ -55,8 +58,16 @@ namespace tg = trtllm::gen; // Type of the gated activation enum class ActType { - // silu(x) = x * sigmoid(x) = x * (1 / (1 + e^(-x))) - Silu = 0 + // For ActType == SwiGlu, ideally we would like to have something like + // gatedAct = scaleC * (x0 * scaleAb + beta) * ((x1 * scaleGate) * sigmoid(alpha * x1 * + // scaleGate)). 
+ // But for now, we use the simplified version + // gatedAct = scaleC' * (x0 + beta') * ((x1 * scaleGate) * sigmoid(alpha * x1 * scaleGate)), + // where x0 and x1 are the raw numbers from Gemm, while scaleC and scaleGate are input scales, + // beta' = beta / scaleAb, scaleC' = scaleC * scaleAb. + // + // GatedSilu is a special case of SwiGlu where the alpha is 1.0 and the beta is 0.0. + SwiGlu }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -69,24 +80,38 @@ enum class ActType return (type == ActType::actType); \ } -TLLM_ACT_TYPE_FUNCTION(Silu) +TLLM_ACT_TYPE_FUNCTION(SwiGlu) #undef TLLM_ACT_TYPE_FUNCTION //////////////////////////////////////////////////////////////////////////////////////////////////// -struct GemmGatedActOptions : virtual public gemm::GemmOptions +inline std::string getActTypeName(ActType type) +{ + switch (type) + { + case ActType::SwiGlu: return "SwiGlu"; + default: return "Unknown type"; + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct GemmGatedActOptions : public gemm::GemmOptions { GemmGatedActOptions() = default; - GemmGatedActOptions(gemm::GemmOptions const& options, ActType actType) + GemmGatedActOptions(gemm::GemmOptions options, ActType actType, bool clampBeforeAct) : gemm::GemmOptions(options) , mActType(actType) + , mClampBeforeAct(clampBeforeAct) { } // Type of the gated activation. - ActType mActType{ActType::Silu}; + ActType mActType{ActType::SwiGlu}; + // Clamp the dequantized values to the range [-limit, limit]. 
+ bool mClampBeforeAct{false}; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -108,7 +133,7 @@ inline bool checkAndUpdateGemmGatedActOptions( if (options.mUseTmaStore) { - TLLM_CHECK_ERROR(hiddenEpilogueTileSize * tg::dtypeGetNumBits(options.mDtypeElt) / /* bits */ 8 % 32 == 0, + TLLM_CHECK_ERROR(hiddenEpilogueTileSize * tg::dtypeGetNumBits(options.mDtypeC) / /* bits */ 8 % 32 == 0, "Unsupported output hidden tile size"); } @@ -138,6 +163,11 @@ inline bool checkAndUpdateGemmGatedActOptions( TLLM_CHECK_ERROR(doesSplitKUseDsmem(options.mSplitK), "Split-k GMEM and GemmGatedAct are not supported yet."); } + if (gemm::isBiasTypeMn(options.mBiasType)) + { + TLLM_CHECK_ERROR(options.mTransposeMmaOutput, "Bias type Mn is not supported with not transpose mma output."); + } + return true; } @@ -148,7 +178,8 @@ inline std::string dumpOptions(GemmGatedActOptions const& options) std::stringstream ss; ss << gemm::dumpOptions(options) << ", "; ss << "mActType=" - << "gemmGatedAct::ActType(" << static_cast(options.mActType) << ")" << std::endl; + << "gemmGatedAct::ActType(" << static_cast(options.mActType) << ")," << std::endl; + ss << "mClampBeforeAct=" << options.mClampBeforeAct << "" << std::endl; return ss.str(); } @@ -169,6 +200,7 @@ struct GemmGatedActConfig uint32_t const mSharedMemSize{0}; char const* mFunctionName{nullptr}; uint32_t const mNumThreadsPerCTA{0}; + char const* mHash{nullptr}; #else trtllm::gen::CudaRunner* mCudaRunner{nullptr}; #endif @@ -190,3 +222,5 @@ struct GemmGatedActConfig #undef TLLM_LOG_INFO #undef TLLM_LOG_ERROR #endif // TLLM_GEN_EXPORT_INTERFACE + +} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmOptions.h index 24624ee0aa..367d68b971 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/GemmOptions.h @@ -23,6 +23,7 @@ #include "KernelParams.h" #include "KernelTraits.h" #include "trtllm/gen/DtypeDecl.h" +#include "trtllm/gen/MmaDecl.h" #include "trtllm/gen/SfLayoutDecl.h" #ifndef TLLM_GEN_EXPORT_INTERFACE #include "trtllm/gen/CudaRunner.h" @@ -33,12 +34,14 @@ template void printArgs(T first, Args... args) { +#ifdef TLLM_GEN_DEBUG std::cout << first; if constexpr (sizeof...(args) > 0) { std::cout << " "; printArgs(args...); } +#endif } #define TLLM_CHECK_ERROR(cond, ...) \ @@ -65,6 +68,9 @@ void printArgs(T first, Args... args) #endif +namespace gemmGatedAct +{ + namespace gemm { @@ -83,9 +89,98 @@ struct GemmOptions virtual ~GemmOptions() = default; #endif + GemmOptions() = default; + + GemmOptions(AllReduceAlgo allReduceAlgo, BiasType biasType, int blockK, int clusterDimX, int clusterDimY, + int clusterDimZ, tg::Dtype dtypeAcc, tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeMmaA, + tg::Dtype dtypeMmaB, bool enablesEarlyExit, bool enablesDelayedEarlyExit, bool enablesGlobalPtxKnobs, + int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, bool gridTriggerSecondaryA, + bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, + bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, KernelTraits kernelTraits, + MatrixLayout layoutA, MatrixLayout layoutB, int m, int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, + bool mockAllReduce, int n, int numSlicesForSplitK, int numSlicesForSliceK, int numStages, int numStagesMma, + int numStagesMmaWithinWorkTile, int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, + bool patchF2fp, bool useShuffledMatrixA, bool sliceK, SplitK splitK, bool transposeMmaOutput, int tileM, + int tileN, int tileK, bool 
useUnrollLoop2xForMma, bool useCustomMmaSchedule, + bool useHoistTryWaitForCustomMmaSchedule, bool useDeepSeekFp8, bool usePerTokenSfA, bool usePerTokenSfB, + bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, + tg::SfLayout sfLayoutC, int sfReshapeFactor, TileScheduler tileScheduler) + : mAllReduceAlgo{allReduceAlgo} + , mBiasType{biasType} + , mBlockK(blockK) + , mClusterDimX{clusterDimX} + , mClusterDimY{clusterDimY} + , mClusterDimZ{clusterDimZ} + , mDtypeAcc{dtypeAcc} + , mDtypeA{dtypeA} + , mDtypeB{dtypeB} + , mDtypeC{dtypeC} + , mDtypeMmaA{dtypeMmaA} + , mDtypeMmaB{dtypeMmaB} + , mEnablesEarlyExit{enablesEarlyExit} + , mEnablesDelayedEarlyExit{enablesDelayedEarlyExit} + , mEnablesGlobalPtxKnobs{enablesGlobalPtxKnobs} + , mEpilogueLdtmDps{epilogueLdtmDps} + , mEpilogueLdtmBits{epilogueLdtmBits} + , mEpilogueTileM{epilogueTileM} + , mEpilogueTileN{epilogueTileN} + , mGridTriggerSecondaryA{gridTriggerSecondaryA} + , mGridTriggerSecondaryB{gridTriggerSecondaryB} + , mGridWaitForPrimaryEarlyExit{gridWaitForPrimaryEarlyExit} + , mGridWaitForPrimaryA{gridWaitForPrimaryA} + , mGridWaitForPrimaryB{gridWaitForPrimaryB} + , mHoistLoadTaskInit{hoistLoadTaskInit} + , mHoistMmaTaskTryWaits{hoistMmaTaskTryWaits} + , mK{k} + , mKernelTraits{kernelTraits} + , mLayoutA{layoutA} + , mLayoutB{layoutB} + , mM{m} + , mMmaK{mmaK} + , mMmaKind{mmaKind} + , mMmaM{mmaM} + , mMmaN{mmaN} + , mMockAllReduce{mockAllReduce} + , mN{n} + , mNumSlicesForSplitK{numSlicesForSplitK} + , mNumSlicesForSliceK{numSlicesForSliceK} + , mNumStages{numStages} + , mNumStagesMma{numStagesMma} + , mNumStagesMmaWithinWorkTile{numStagesMmaWithinWorkTile} + , mNumStagesMmaAcrossWorkTile{numStagesMmaAcrossWorkTile} + , mNumStagesWorkId{numStagesWorkId} + , mOutputDebugTensors{outputDebugTensors} + , mPatchF2fp{patchF2fp} + , mUseShuffledMatrixA{useShuffledMatrixA} + , mSliceK{sliceK} + , mSplitK{splitK} + , mTransposeMmaOutput{transposeMmaOutput} + 
, mTileM{tileM} + , mTileN{tileN} + , mTileK{tileK} + , mUseUnrollLoop2xForMma{useUnrollLoop2xForMma} + , mUseCustomMmaSchedule{useCustomMmaSchedule} + , mUseHoistTryWaitForCustomMmaSchedule{useHoistTryWaitForCustomMmaSchedule} + , mUseDeepSeekFp8{useDeepSeekFp8} + , mUsePerTokenSfA{usePerTokenSfA} + , mUsePerTokenSfB{usePerTokenSfB} + , mUseTmaStore{useTmaStore} + , mUseTwoTmaLoadWarps{useTwoTmaLoadWarps} + , mUseTwoMmaWarps{useTwoMmaWarps} + , mSfLayoutA{sfLayoutA} + , mSfLayoutB{sfLayoutB} + , mSfLayoutC{sfLayoutC} + , mSfReshapeFactor{sfReshapeFactor} + , mTileScheduler{tileScheduler} + { + } + // The all-reduce algorithm. AllReduceAlgo mAllReduceAlgo{AllReduceAlgo::None}; - + // The type of bias. + BiasType mBiasType{BiasType::None}; + // Block size in the K dimension + int mBlockK{-1}; // Cluster size in X dim. int mClusterDimX{1}; // Cluster size in Y dim. @@ -94,16 +189,34 @@ struct GemmOptions int mClusterDimZ{1}; // Data type of the accumulators. tg::Dtype mDtypeAcc{tg::Dtype::Fp32}; - // Data type of the inputs. - tg::Dtype mDtypeElt{tg::Dtype::Fp16}; + // Data type of the A matrix. + tg::Dtype mDtypeA{tg::Dtype::Fp16}; + // Data type of the B matrix. + tg::Dtype mDtypeB{tg::Dtype::Void}; // Data type of the outputs. tg::Dtype mDtypeC{tg::Dtype::Void}; + // Data type of the A matrix for the MMA, if different from the input type. + tg::Dtype mDtypeMmaA{tg::Dtype::Void}; + // Data type of the B matrix for the MMA, if different from the input type. + tg::Dtype mDtypeMmaB{tg::Dtype::Void}; // Whether to enable early exit. bool mEnablesEarlyExit{false}; - // Whether to enable early exit. + // Whether to enable delayed early exit to overlap + // numNonExitingCtas loading with the other instructions. bool mEnablesDelayedEarlyExit{false}; // Whether to enable the global PTX knobs for guiding the compiler optimizations. bool mEnablesGlobalPtxKnobs{true}; + // The epilogue supports multiple LDTM shapes, although not every shape is applicable in every + // case. 
In particular: + // - On Hopper: must be 16dp256bit. + // - Transposed output: must be 16dp256bit. + // - Non-transposed output: + // - NvFp4 with fused activation: must be 32dp32bit. + // - Else it can be either 16dp256bit or 32dp32bit. + // The number of DP lanes in the epilogue LDTM. + int mEpilogueLdtmDps{16}; + // The number of bits in the epilogue LDTM. + int mEpilogueLdtmBits{256}; // Tile size for the epilogue in M dimension. int mEpilogueTileM{128}; // Tile size for the epilogue in N dimension. @@ -118,16 +231,24 @@ struct GemmOptions bool mGridWaitForPrimaryA{true}; // Whether the load of B should wait on a grid dependency. bool mGridWaitForPrimaryB{true}; + // Whether to hoist the initialization of the loading tasks. + bool mHoistLoadTaskInit{true}; // Whether to hoist the mbarrier try_waits (e.g., mma.prodAcq, smemAb.consWait) in the MMA task. bool mHoistMmaTaskTryWaits{false}; // The K dimension of GEMM. int mK{16 * 16}; // Traits of the kernel. KernelTraits mKernelTraits{}; + // Layout of A matrix + MatrixLayout mLayoutA{MatrixLayout::MajorK}; + // Layout of B matrix + MatrixLayout mLayoutB{MatrixLayout::MajorK}; // The M dimension of GEMM. int mM{128 * 2}; // Size of the MMA instruction in the K dimension. int mMmaK{16}; + // The kind of MMA instruction to use. + tg::MmaKind mMmaKind{tg::MmaKind::Auto}; // Size of the MMA instruction in the M dimension. int mMmaM{64}; // Size of the MMA instruction in the N dimension. @@ -156,6 +277,8 @@ struct GemmOptions int mNumStagesWorkId{3}; // Whether to output debug tensors. bool mOutputDebugTensors{false}; + // Patch float conversions. + bool mPatchF2fp{false}; // Reorder rows/cols in the A matrix for the better memory accesses in the M-major epilogue. bool mUseShuffledMatrixA{false}; // Slice-K implementation to use TileM dimension for TileK. @@ -196,6 +319,12 @@ struct GemmOptions tg::SfLayout mSfLayoutB{tg::SfLayout::R128c4}; // Scale factors layout for C. 
tg::SfLayout mSfLayoutC{tg::SfLayout::R128c4}; + // Number of "repeats", i.e. reshaping factor, to fold hidden dimension into SfBlock dimension. + // As result, the hidden dimension of the SF tensor must be a multiple of NumRepeats * + // numEltsPerSf * 4. This reduces the problem shape space that the kernel is able to run. + // But it reduces the number of L2 requests under the hood and potentially improves perf. + // Applies to layout 8x4 only. + int mSfReshapeFactor{1}; // Tile scheduler type. TileScheduler mTileScheduler{TileScheduler::Static}; }; @@ -225,6 +354,7 @@ struct GemmConfig uint32_t const mSharedMemSize{0}; char const* mFunctionName{nullptr}; uint32_t const mNumThreadsPerCTA{0}; + char const* mHash{nullptr}; #else trtllm::gen::CudaRunner* mCudaRunner{nullptr}; #endif @@ -252,27 +382,50 @@ inline std::string toString(trtllm::gen::Dtype e) //////////////////////////////////////////////////////////////////////////////////////////////////// +template <> +inline std::string toString(trtllm::gen::MmaKind e) +{ + return trtllm::gen::mmaKindToString(e); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + inline std::string dumpOptions(GemmOptions const& options) { std::stringstream ss; ss << "mAllReduceAlgo=" << "gemm::AllReduceAlgo(" << static_cast(options.mAllReduceAlgo) << ")" << "," << std::endl; + ss << "mBiasType=" + << "gemm::BiasType(" << static_cast(options.mBiasType) << ")" + << "," << std::endl; + ss << "mBlockK=" << options.mBlockK << "," << std::endl; ss << "mClusterDimX=" << options.mClusterDimX << "," << std::endl; ss << "mClusterDimY=" << options.mClusterDimY << "," << std::endl; ss << "mClusterDimZ=" << options.mClusterDimZ << "," << std::endl; ss << "mDtypeAcc=" << "trtllm::gen::Dtype(" << static_cast(options.mDtypeAcc) << ")" << "," << std::endl; - ss << "mDtypeElt=" - << "trtllm::gen::Dtype(" << static_cast(options.mDtypeElt) << ")" + ss << "mDtypeA=" + << "trtllm::gen::Dtype(" << 
static_cast(options.mDtypeA) << ")" + << "," << std::endl; + ss << "mDtypeB=" + << "trtllm::gen::Dtype(" << static_cast(options.mDtypeB) << ")" << "," << std::endl; ss << "mDtypeC=" << "trtllm::gen::Dtype(" << static_cast(options.mDtypeC) << ")" << "," << std::endl; + ss << "mDtypeMmaA=" + << "trtllm::gen::Dtype(" << static_cast(options.mDtypeMmaA) << ")" + << "," << std::endl; + ss << "mDtypeMmaB=" + << "trtllm::gen::Dtype(" << static_cast(options.mDtypeMmaB) << ")" + << "," << std::endl; ss << "mEnablesEarlyExit=" << options.mEnablesEarlyExit << "," << std::endl; ss << "mEnablesDelayedEarlyExit=" << options.mEnablesDelayedEarlyExit << "," << std::endl; ss << "mEnablesGlobalPtxKnobs=" << options.mEnablesGlobalPtxKnobs << "," << std::endl; + ss << "mEpilogueLdtmDps=" << options.mEpilogueLdtmDps << "," << std::endl; + ss << "mEpilogueLdtmBits=" << options.mEpilogueLdtmBits << "," << std::endl; ss << "mEpilogueTileM=" << options.mEpilogueTileM << "," << std::endl; ss << "mEpilogueTileN=" << options.mEpilogueTileN << "," << std::endl; ss << "mGridTriggerSecondaryA=" << options.mGridTriggerSecondaryA << "," << std::endl; @@ -280,12 +433,20 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mGridWaitForPrimaryEarlyExit=" << options.mGridWaitForPrimaryEarlyExit << "," << std::endl; ss << "mGridWaitForPrimaryA=" << options.mGridWaitForPrimaryA << "," << std::endl; ss << "mGridWaitForPrimaryB=" << options.mGridWaitForPrimaryB << "," << std::endl; + ss << "mHoistLoadTaskInit=" << options.mHoistLoadTaskInit << "," << std::endl; ss << "mHoistMmaTaskTryWaits=" << options.mHoistMmaTaskTryWaits << "," << std::endl; ss << "mK=" << options.mK << "," << std::endl; ss << "mKernelTraits={}" << "," << std::endl; + ss << "mLayoutA=gemm::MatrixLayout(" << static_cast(options.mLayoutA) << ")" + << "," << std::endl; + ss << "mLayoutB=gemm::MatrixLayout(" << static_cast(options.mLayoutB) << ")" + << "," << std::endl; ss << "mM=" << options.mM << "," << std::endl; ss << 
"mMmaK=" << options.mMmaK << "," << std::endl; + ss << "mMmaKind=" + << "trtllm::gen::MmaKind(" << static_cast(options.mMmaKind) << ")" + << "," << std::endl; ss << "mMmaM=" << options.mMmaM << "," << std::endl; ss << "mMmaN=" << options.mMmaN << "," << std::endl; ss << "mMockAllReduce=" << options.mMockAllReduce << "," << std::endl; @@ -298,6 +459,7 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mNumStagesMmaAcrossWorkTile=" << options.mNumStagesMmaAcrossWorkTile << "," << std::endl; ss << "mNumStagesWorkId=" << options.mNumStagesWorkId << "," << std::endl; ss << "mOutputDebugTensors=" << options.mOutputDebugTensors << "," << std::endl; + ss << "mPatchF2fp=" << options.mPatchF2fp << "," << std::endl; ss << "mUseShuffledMatrixA=" << options.mUseShuffledMatrixA << "," << std::endl; ss << "mSliceK=" << options.mSliceK << "," << std::endl; ss << "mSplitK=" @@ -325,6 +487,7 @@ inline std::string dumpOptions(GemmOptions const& options) ss << "mSfLayoutC=" << "trtllm::gen::SfLayout(" << static_cast(options.mSfLayoutC) << ")" << "," << std::endl; + ss << "mSfReshapeFactor=" << options.mSfReshapeFactor << "," << std::endl; ss << "mTileScheduler=" << "gemm::TileScheduler(" << static_cast(options.mTileScheduler) << ")" << std::endl; return ss.str(); @@ -340,6 +503,14 @@ inline T divUp(T a, T b) //////////////////////////////////////////////////////////////////////////////////////////////////// +template +inline T divUpMul(T a, T b) +{ + return gemm::divUp(a, b) * b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + inline int32_t getShuffleBlockSize(int epilogueTileM) { int shuffleBlockSize = 16; @@ -356,10 +527,136 @@ inline int32_t getShuffleBlockSize(int epilogueTileM) inline bool checkAndUpdateGemmOptions( GemmOptions& options, bool isBlackwell, int /* tpGrpSize */, bool updateOptions = true) { - if (options.mDtypeElt == tg::Dtype::E4m3 && options.mMmaK != 32) + + if (options.mDtypeB == 
tg::Dtype::Void) { - TLLM_LOG_WARNING( - "Unsupported MmaK (", options.mMmaK, ") for ", gemm::toString(options.mDtypeElt), ". Setting MmaK to 32"); + if (updateOptions) + { + options.mDtypeB = options.mDtypeA; + } + else + { + return false; + } + } + + // If not specified, used the input dtypes as MMA dtypes (no cast required). + if (options.mDtypeMmaA == tg::Dtype::Void) + { + if (updateOptions) + { + options.mDtypeMmaA = options.mDtypeA; + } + else + { + return false; + } + } + if (options.mDtypeMmaB == tg::Dtype::Void) + { + if (updateOptions) + { + options.mDtypeMmaB = options.mDtypeB; + } + else + { + return false; + } + } + + // Check that the A cast is supported. + // Currently, we only support {MxFp4, NvFp4} -> Bf16. + TLLM_CHECK_ERROR((options.mDtypeA == options.mDtypeMmaA) + || ((options.mDtypeA == tg::Dtype::MxE2m1 || options.mDtypeA == tg::Dtype::E2m1) + && options.mDtypeMmaA == tg::Dtype::Bfloat16) + || (options.mDtypeA == tg::Dtype::E2m1 && options.mDtypeMmaA == tg::Dtype::E4m3), + "Unsupported cast for A: ", tg::dtypeToString(options.mDtypeA), " -> ", tg::dtypeToString(options.mDtypeMmaA)); + + // Check that the B cast is supported. + // Currently, we only support Fp8 -> MxFp8. + // TODO: add same support for A (no transpose) + TLLM_CHECK_ERROR((options.mDtypeB == options.mDtypeMmaB) + || (options.mDtypeB == tg::Dtype::E4m3 && options.mDtypeMmaB == tg::Dtype::MxE4m3), + "Unsupported cast for B: ", tg::dtypeToString(options.mDtypeB), " -> ", tg::dtypeToString(options.mDtypeMmaB)); + + if (options.mDtypeA != options.mDtypeMmaA) + { + TLLM_CHECK_ERROR(options.mTileM == 128, "TileM must be 128 when casting the input matrix A before the MMA."); + } + + if (options.mPatchF2fp) + { + TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::MxE2m1 && options.mDtypeMmaA == tg::Dtype::Bfloat16, + "PatchF2fp is only supported for MxFp4 to Bf16 casts."); + } + + // FIXME: We do not support different dtypes for A and B when not on Blackwell. 
+ if (!isBlackwell) + { + TLLM_CHECK_ERROR( + options.mDtypeMmaA == options.mDtypeMmaB, "For non-Blackwell, A and B must have the same dtype."); + } + + // Check that the different dtypes for A and B are supported by the tensor core + // kind::f8f6f4 + if (options.mDtypeMmaA == tg::Dtype::E4m3 || options.mDtypeMmaA == tg::Dtype::E2m1) + { + TLLM_CHECK_ERROR(options.mDtypeMmaB == tg::Dtype::E4m3 || options.mDtypeMmaB == tg::Dtype::E2m1, + "For dtypeMmaA = E4m3/E2m1 A, dtypeMmaB must also be E4m3/E2m1."); + } + + // kind::mxf8f6f4 + if (options.mDtypeMmaA == tg::Dtype::MxE4m3 || options.mDtypeMmaA == tg::Dtype::MxE2m1) + { + TLLM_CHECK_ERROR(options.mDtypeMmaB == tg::Dtype::MxE4m3 || options.mDtypeMmaB == tg::Dtype::MxE2m1, + "For dtypeMmaA = MxE4m3 or MxE2m1, dtypeMmaB must also be MxE4m3 or MxE2m1."); + } + if (options.mDtypeMmaB == tg::Dtype::MxE4m3 || options.mDtypeMmaB == tg::Dtype::MxE2m1) + { + TLLM_CHECK_ERROR(options.mDtypeMmaA == tg::Dtype::MxE4m3 || options.mDtypeMmaA == tg::Dtype::MxE2m1, + "For dtypeMmaB = MxE4m3 or MxE2m1, dtypeMmaA must also be MxE4m3 or MxE2m1."); + } + + // kind::f16 + if (options.mDtypeMmaA == tg::Dtype::Fp16 || options.mDtypeMmaA == tg::Dtype::Bfloat16) + { + TLLM_CHECK_ERROR(options.mDtypeMmaB == options.mDtypeMmaA, + "For dtypeMmaA = Fp16/Bfloat16, dtypeMmaB must be the same as dtypeMmaA."); + } + + // When one of the inputs needs to be cast, we must use two load warps. + if ((options.mDtypeMmaA != options.mDtypeA || options.mDtypeMmaB != options.mDtypeB) + && !options.mUseTwoTmaLoadWarps) + { + TLLM_LOG_WARNING("Two TMA load warps must be enabled if any of the inputs needs to be cast."); + } + + // When different dtypes are used for A and B, we must use different tiles to do the loading. + // It is not strictly required, but current implementation of SmemAb requires that. 
+ if (options.mDtypeA != options.mDtypeB) + { + TLLM_CHECK_ERROR( + options.mUseTwoTmaLoadWarps, "Two TMA load warps must be enabled for different input types of A and B."); + } + + // Get the mma kind for the input types. + if (options.mMmaKind == tg::MmaKind::Auto) + { + if (updateOptions) + { + options.mMmaKind = dtypeGetMmaKind(options.mDtypeMmaA, options.mDtypeMmaB); + } + else + { + return false; + } + } + + if ((options.mMmaKind == tg::MmaKind::Fp8Fp6Fp4 || options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) + && options.mMmaK != 32) + { + TLLM_LOG_WARNING("Unsupported MmaK (", options.mMmaK, ") for MmaKind=", gemm::toString(options.mMmaKind), + ". Setting MmaK to 32"); if (updateOptions) { options.mMmaK = 32; @@ -371,15 +668,42 @@ inline bool checkAndUpdateGemmOptions( } } + // Check LDTM shape. + if (isBlackwell) + { + TLLM_CHECK_ERROR((options.mEpilogueLdtmDps == 16 && options.mEpilogueLdtmBits == 256) + || (options.mEpilogueLdtmDps == 32 && options.mEpilogueLdtmBits == 32), + "Unsupported LDTM shape: ", options.mEpilogueLdtmDps, "dp", options.mEpilogueLdtmBits, "bit."); + if (options.mEpilogueTileM == 64) + { + TLLM_CHECK_ERROR(options.mEpilogueLdtmDps == 16, + "Unsupported LDTM shape for epilogueTileM=64: ", options.mEpilogueLdtmDps, "dp", + options.mEpilogueLdtmBits, "bit."); + } + if (options.mTransposeMmaOutput) + { + // We can't use 32dp32bit LDTM for transposed outputs because we need each thread to own + // multiple consecutive output elements. + TLLM_CHECK_ERROR((options.mEpilogueLdtmDps == 16 && options.mEpilogueLdtmBits == 256), + "Only 16dp256bit LDTM is supported for transposed outputs."); + } + } + else + { + TLLM_CHECK_ERROR(options.mEpilogueLdtmDps == 16 && options.mEpilogueLdtmBits == 256, + "Hopper does not use TMEM. The register layout corresponds to 16dp256bit. Got ", options.mEpilogueLdtmDps, + "dp", options.mEpilogueLdtmBits, "bit."); + } + // Constraints for NvFp4 and MxFp8. 
- if ((options.mDtypeElt == tg::Dtype::E2m1 || options.mDtypeElt == tg::Dtype::MxE4m3 + if ((options.mMmaKind == tg::MmaKind::MxFp4NvFp4 || options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4 || options.mDtypeC == tg::Dtype::MxE4m3) && options.mMmaM != 128) { // MMA M must be 128 when the input uses block scaling, or when the output is an Mx format. int newTileM = 128 * divUp(options.mTileM, 128); - TLLM_LOG_WARNING("Unsupported MmaM (", options.mMmaM, ") for dtypeElt=", gemm::toString(options.mDtypeElt), - ", dtypeC=", gemm::toString(options.mDtypeC), ". Setting MmaM to 128 and TileM to ", newTileM); + TLLM_LOG_WARNING("Unsupported MmaM (", options.mMmaM, ") for MmaKind=", gemm::toString(options.mMmaKind), + ". Setting MmaM to 128 and TileM to ", newTileM); if (updateOptions) { options.mMmaM = 128; @@ -390,18 +714,15 @@ inline bool checkAndUpdateGemmOptions( return false; } } - if (options.mDtypeElt == tg::Dtype::E2m1 || options.mDtypeElt == tg::Dtype::MxE4m3) + if (options.mMmaKind == tg::MmaKind::MxFp4NvFp4 || options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) { TLLM_CHECK_ERROR(isBlackwell, "Block scaling is only supported on Blackwell"); - TLLM_CHECK_ERROR(options.mSfLayoutB == tg::SfLayout::R128c4 || options.mSfLayoutB == tg::SfLayout::R8c4, - "Only the 128x4 and 8x4 SF layouts are supported for B, got ", tg::sfLayoutToString(options.mSfLayoutB)); - - int const mmaK = (options.mDtypeElt == tg::Dtype::E2m1) ? 64 : 32; + int const mmaK = (options.mMmaKind == tg::MmaKind::MxFp4NvFp4) ? 64 : 32; if (options.mMmaK != mmaK) { int newTileK = mmaK * divUp(options.mTileK, mmaK); - TLLM_LOG_WARNING("Unsupported MmaK (", options.mMmaK, ") for ", gemm::toString(options.mDtypeElt), + TLLM_LOG_WARNING("Unsupported MmaK (", options.mMmaK, ") for MmaKind=", gemm::toString(options.mMmaKind), ". 
Setting MmaK to ", mmaK, " and TileK to ", newTileK); if (updateOptions) { @@ -414,18 +735,56 @@ inline bool checkAndUpdateGemmOptions( } } + // The MMA N may only be smaller than 64 if it is equal to the tile N. + TLLM_CHECK_ERROR(options.mMmaN >= 64 || options.mMmaN == options.mTileN, "MmaN (", options.mMmaN, + ") must be >= 64 or equal to TileN (", options.mTileN, ")"); + } + if (tg::dtypeIsBlockFmt(options.mDtypeA)) + { + int numEltsPerSfA = tg::dtypeNumEltsPerSf(options.mDtypeA); + TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfA) == 0, "TileK (", options.mTileK, + ") must be a multiple of ", (4 * numEltsPerSfA), " for typeA ", gemm::toString(options.mDtypeA)); + auto const numEltsPerSfAInK = options.mK / numEltsPerSfA; + TLLM_CHECK_ERROR(numEltsPerSfAInK % 4 == 0, "K dimension of scaling factors for A (", numEltsPerSfAInK, + ") must be a multiple of 4"); + } + if (tg::dtypeIsBlockFmt(options.mDtypeB)) + { + TLLM_CHECK_ERROR(options.mSfLayoutB == tg::SfLayout::R128c4 || options.mSfLayoutB == tg::SfLayout::R8c4 + || options.mSfLayoutB == tg::SfLayout::Linear, + "Only the 128x4 and 8x4 SF layouts are supported for B, got ", tg::sfLayoutToString(options.mSfLayoutB)); + // TileN must be a multiple of the number of rows per SF tile. int const numSfTileRowsB = options.mSfLayoutB == tg::SfLayout::R128c4 ? 128 : 8; TLLM_CHECK_ERROR(options.mTileN % numSfTileRowsB == 0, "TileN (", options.mTileN, ") must be a multiple of ", numSfTileRowsB, " for B SF layout ", tg::sfLayoutToString(options.mSfLayoutB)); - // The MMA N may only be smaller than 64 if it is equal to the tile N. 
- TLLM_CHECK_ERROR(options.mMmaN >= 64 || options.mMmaN == options.mTileN, "MmaN (", options.mMmaN, - ") must be >= 64 or equal to TileN (", options.mTileN, ") for ", gemm::toString(options.mDtypeElt)); - int numEltsPerSf = tg::dtypeNumEltsPerSf(options.mDtypeElt); - TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSf) == 0, "TileK (", options.mTileK, - ") must be a multiple of ", (4 * numEltsPerSf), " for type ", gemm::toString(options.mDtypeElt)); + int numEltsPerSfB = tg::dtypeNumEltsPerSf(options.mDtypeB); + TLLM_CHECK_ERROR(options.mTileK % (4 * numEltsPerSfB) == 0, "TileK (", options.mTileK, + ") must be a multiple of ", (4 * numEltsPerSfB), " for typeB ", gemm::toString(options.mDtypeB)); + auto const numEltsPerSfBInK = options.mK / numEltsPerSfB; + TLLM_CHECK_ERROR(numEltsPerSfBInK % 4 == 0, "K dimension of scaling factors for B (", numEltsPerSfBInK, + ") must be a multiple of 4"); } + + int32_t padMultiplierA = 1; + int32_t padMultiplierB = 1; + if (options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) + { + if (options.mDtypeA == tg::Dtype::MxE2m1) + { + padMultiplierA = 2; + } + if (options.mDtypeB == tg::Dtype::MxE2m1) + { + padMultiplierB = 2; + } + } + TLLM_CHECK_ERROR((padMultiplierA * tg::dtypeGetNumBits(options.mDtypeA) * options.mK / 8) % 16 == 0, + "K dimension of A must be aligned to 16 bytes."); + TLLM_CHECK_ERROR((padMultiplierB * tg::dtypeGetNumBits(options.mDtypeB) * options.mK / 8) % 16 == 0, + "K dimension of B must be aligned to 16 bytes."); + if (options.mDtypeC == tg::Dtype::E2m1 || options.mDtypeC == tg::Dtype::MxE4m3) { TLLM_CHECK_ERROR(isBlackwell, "Block scaling is only supported on Blackwell"); @@ -433,8 +792,10 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mSfLayoutC == tg::SfLayout::R128c4 || options.mSfLayoutC == tg::SfLayout::R8c4, "Only the 128x4 and 8x4 SF layouts are supported for C."); int const numSfTileRowsC = options.mSfLayoutC == tg::SfLayout::R128c4 ? 
128 : 8; - TLLM_CHECK_ERROR(options.mTileN % numSfTileRowsC == 0, "TileN (", options.mTileN, ") must be a multiple of ", - numSfTileRowsC, " for C SF layout ", tg::sfLayoutToString(options.mSfLayoutC)); + int const tileTokenDim = options.mTransposeMmaOutput ? options.mTileN : options.mTileM; + TLLM_CHECK_ERROR_FMT(tileTokenDim % numSfTileRowsC == 0, + "Tile%s (%d) must be a multiple of %d for C SF layout %s", options.mTransposeMmaOutput ? "N" : "M", + tileTokenDim, numSfTileRowsC, tg::sfLayoutToString(options.mSfLayoutC).c_str()); int const hiddenDim = options.mTransposeMmaOutput ? options.mM : options.mN; int const hiddenGranularity = 4 * tg::dtypeNumEltsPerSf(options.mDtypeC); @@ -447,10 +808,10 @@ inline bool checkAndUpdateGemmOptions( // If dtypeC is unspecified (Dtype::Void), assign to the input dtype. if (options.mDtypeC == tg::Dtype::Void) { - TLLM_LOG_INFO("Setting dtypeC to ", tg::dtypeToString(options.mDtypeElt)); + TLLM_LOG_INFO("Setting dtypeC to ", tg::dtypeToString(options.mDtypeA)); if (updateOptions) { - options.mDtypeC = options.mDtypeElt; + options.mDtypeC = options.mDtypeA; } else { @@ -518,10 +879,6 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mM > 0 && options.mN > 0 && options.mK > 0, "M, N and K must be larger than 0"); TLLM_CHECK_ERROR(options.mNumSlicesForSplitK > 0, "Split K must be larger than 0."); - TLLM_CHECK_ERROR(options.mK % options.mNumSlicesForSplitK == 0, "K must be divisible by NumSlicesForSplitK."); - TLLM_CHECK_ERROR((options.mK / options.mNumSlicesForSplitK) % options.mTileK == 0, - "K / NumSlicesForSplitK must be divisible by TileK. 
Found TileK=", options.mTileK, " and K=", options.mK, - " and NumSlicesForSplitK=", options.mNumSlicesForSplitK); if (options.mUseShuffledMatrixA) { @@ -530,8 +887,10 @@ inline bool checkAndUpdateGemmOptions( shuffleBlockSize, ") when useShuffledMatrixA"); } - TLLM_CHECK_ERROR(options.mMmaM <= options.mEpilogueTileM && options.mMmaN <= options.mEpilogueTileN, - "EpilogueTileM and EpilogueTileN must be larger or equal than the respective atom sizes."); + if (!options.mSliceK) + { + TLLM_CHECK_ERROR(options.mMmaM <= options.mEpilogueTileM, "EpilogueTileM must be larger or equal than mmaM."); + } TLLM_CHECK_ERROR(options.mTileM % options.mEpilogueTileM == 0 && options.mTileN % options.mEpilogueTileN == 0, "TileM and TileN must be divisible by EpilogueTileM and EpilogueTileN respectively."); TLLM_CHECK_ERROR( @@ -677,19 +1036,25 @@ inline bool checkAndUpdateGemmOptions( { TLLM_CHECK_ERROR( options.mNumStagesMmaWithinWorkTile == 1, "Non-DeepSeekFp8 requires numStagesMmaWithinWorkTile == 1"); + if (options.mNumStagesMma > 1) + { + TLLM_CHECK_ERROR(options.mTileScheduler == TileScheduler::Persistent, + "Non-DeepSeekFp8 requires persistent scheduler when using numStagesMma >1"); + } } if (options.mUseDeepSeekFp8) { - TLLM_CHECK_ERROR(options.mDtypeElt == tg::Dtype::E4m3, "A and B dtype must be E4m3 for DeepSeek Fp8. Found ", - tg::dtypeToString(options.mDtypeElt)); + TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::E4m3 && options.mDtypeB == tg::Dtype::E4m3, + "A and B dtype must be E4m3 for DeepSeek Fp8. Found dtypeA=", tg::dtypeToString(options.mDtypeA), + " dtypeB=", tg::dtypeToString(options.mDtypeB)); TLLM_CHECK_ERROR(isBlackwell, "DeepSeek Fp8 is not supported for Hopper"); TLLM_CHECK_ERROR(options.mAllReduceAlgo == AllReduceAlgo::None, "DeepSeek Fp8 does not support AllReduce"); // Check that TileK = 128 for correct scaling of every 128 channels. 
TLLM_CHECK_ERROR(options.mTileK == 128, "Tile-K must be equal to 128 for DeepSeek Fp8"); + TLLM_CHECK_ERROR(options.mK % options.mTileK == 0, "K must be a multiple of TileK"); // Tile sizes of the output hidden dimension. - auto hiddenDim = options.mTransposeMmaOutput ? options.mM : options.mN; auto hiddenDimPerOutputTile = options.mTransposeMmaOutput ? options.mTileM : options.mTileN; auto hiddenDimPerEpilogueTile = options.mTransposeMmaOutput ? options.mEpilogueTileM : options.mEpilogueTileN; auto hiddenDimPerMma = options.mTransposeMmaOutput ? options.mMmaM : options.mMmaN; @@ -702,9 +1067,6 @@ inline bool checkAndUpdateGemmOptions( // Use two MMA warps to reduce mbar trywait latency. TODO: enable by default for deepseek. // options.mUseTwoMmaWarps = true; - // Make sure the GEMM-M/N dimension is a multiple of 128 when using DeepSeek FP8. - TLLM_CHECK_ERROR(hiddenDim % 128 == 0, "GEMM-", hiddenDimName, - " must be a multiple of 128 when using DeepSeek Fp8. Found ", hiddenDim); // Make sure the GEMM-K dimension is a multiple of 128 when using DeepSeek FP8. TLLM_CHECK_ERROR( options.mK % 128 == 0, "GEMM-K must be a multiple of 128 when using DeepSeek Fp8. Found ", options.mK); @@ -732,25 +1094,32 @@ inline bool checkAndUpdateGemmOptions( TLLM_CHECK_ERROR(options.mTileN == options.mEpilogueTileN, "TileN must be equal to EpilogueTileN for slice-K"); TLLM_LOG_WARNING("Overwriting TileM and EpilogueTileM to 32 for slice-K"); - if (updateOptions) + if (options.mTileM != 32 || options.mEpilogueTileM != 32) { - // FIXME: it is possible to remove this restriction. - options.mTileM = 32; - options.mEpilogueTileM = 32; + if (updateOptions) + { + // FIXME: it is possible to remove this restriction. 
+ options.mTileM = 32; + options.mEpilogueTileM = 32; + } + else + { + return false; + } } - else - { - return false; - } - TLLM_CHECK_ERROR(options.mDtypeElt == tg::Dtype::E4m3, "Slice-K requires e4m3 input dtype"); + TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::E4m3 && options.mDtypeB == tg::Dtype::E4m3, + "Slice-K requires e4m3 input dtype"); - if (updateOptions) + if (options.mNumSlicesForSliceK != 4) { - options.mNumSlicesForSliceK = 4; - } - else - { - return false; + if (updateOptions) + { + options.mNumSlicesForSliceK = 4; + } + else + { + return false; + } } TLLM_CHECK_ERROR((options.mTileK / options.mMmaK) % options.mNumSlicesForSliceK == 0, "TileK (", options.mTileK, ") / MmaK (", options.mMmaK, ") must be a multiple of mNumSlicesForSliceK (", options.mNumSlicesForSliceK, @@ -759,14 +1128,22 @@ inline bool checkAndUpdateGemmOptions( if (options.mUseUnrollLoop2xForMma) { - bool notSupported = (options.mK / options.mNumSlicesForSplitK) % (options.mTileK * 2) != 0; - // Check that the 2*TileK is a multiple of MmaK when UnrollLoop2x is enabled. - // This is to avoid deadlock when mma runs even-numbered loop while the other warps run - // odd-numbered loop. + // Number of iterations in K dimension after padding. + // Note the perCtaK in each CTA in the splitK group are padded to the same number of iterations. + // E.g., K = 512, TileK = 128, numSlicesForSplitK = 3. Then the padded K is + // + // ceil(512 / (128*3)) * (128*3) = 768 + // + int paddedK = divUpMul(options.mK, options.mTileK * options.mNumSlicesForSplitK); + // Check that the padded K (K rounded to next multiple of tileK) is a multiple of 2*TileK when + // UnrollLoop2x is enabled. This is to avoid deadlock when mma runs even-numbered loop while the + // other warps run odd-numbered loop. + // + bool notSupported = (paddedK / options.mNumSlicesForSplitK) % (options.mTileK * 2) != 0; if (notSupported) { TLLM_LOG_WARNING("Size K / splitK must be a multiple of TileK * 2. 
Found TileK=", options.mTileK, - " and K=", options.mK, " and numSlicesForSplitK=", options.mNumSlicesForSplitK, + " and K=", options.mK, " (paddedK=", paddedK, ") and numSlicesForSplitK=", options.mNumSlicesForSplitK, ". Disabling unrollLoop2xForMma."); if (updateOptions) { @@ -821,23 +1198,108 @@ inline bool checkAndUpdateGemmOptions( // // Kernel 1: ----PREEXIT-----------FLUSH // Kernel 2: -------PREEXIT----ACQBULK---FLUSH - // Kernel 3: Warp 0: ---- (!) Output of 1,2 is not yet visible ----------------------- - // Warp 1: ---- (!) We normally assume that 1 is visible is not yet visible- - // Warp 2: -------------------ACQBULK-- Kernel 1,2 output visible ---------- + // Kernel 3: Warp 0: ---- (!) Output of 1,2 is not yet visible + // ----------------------- + // Warp 1: ---- (!) We normally assume that 1 is visible is not yet + // visible- Warp 2: -------------------ACQBULK-- Kernel 1,2 output visible + // ---------- TLLM_CHECK_ERROR((options.mGridWaitForPrimaryA || !options.mGridTriggerSecondaryA), "A: If a task triggers a secondary kernel, it must also wait for primary kernel."); TLLM_CHECK_ERROR((options.mGridWaitForPrimaryB || !options.mGridTriggerSecondaryB), "B: If a task triggers a secondary kernel, it must also wait for primary kernel."); + if (options.mUsePerTokenSfA || options.mUsePerTokenSfB) + { + // Checks applicable to both MetaFP8 and RoutingScalesOnInput + TLLM_CHECK_ERROR(!options.mUseDeepSeekFp8, "DeepSeek FP8 and per-token scaling are not compatible"); + TLLM_CHECK_ERROR(isBlackwell, "Per-token scaling is not supported for Hopper"); + if (options.mUsePerTokenSfA && options.mUsePerTokenSfB) + { + // MetaFP8 case + TLLM_CHECK_ERROR(options.mDtypeA == tg::Dtype::E4m3 && options.mDtypeB == tg::Dtype::E4m3, + "A and B dtype must be E4m3 for Meta Fp8. 
Found dtypeA=", tg::dtypeToString(options.mDtypeA), + " dtypeB=", tg::dtypeToString(options.mDtypeB)); + } + else + { + // RoutingScalesOnInput case + TLLM_CHECK_ERROR((options.mUsePerTokenSfA && !options.mTransposeMmaOutput) + || (options.mUsePerTokenSfB && options.mTransposeMmaOutput), + "In RoutingScalesOnInput mode, perToken scales must be used on activations"); + } + } + + // The generation should support non K-major layouts for both A and B; however, it is unclear if + // there is a use-case + TLLM_CHECK_ERROR((options.mLayoutA == MatrixLayout::MajorK) || (options.mLayoutB == MatrixLayout::MajorK), + "At least one matrix must be in k-major layout"); + + // Some features are currently only support when both matrices are in K-major format + if (options.mLayoutB != MatrixLayout::MajorK || options.mLayoutB != MatrixLayout::MajorK) + { + TLLM_CHECK_ERROR(isBlackwell, "Non K-major layouts are only supported on Blackwell"); + TLLM_CHECK_ERROR(options.mSplitK == SplitK::None, "Non K-major layouts do not support split K"); + } + if (options.mLayoutA == MatrixLayout::MajorMn) + { + TLLM_CHECK_ERROR(tg::dtypeGetNumBits(options.mDtypeA) >= 8, "Subbyte types only support K major layout"); + } + if (options.mLayoutB == MatrixLayout::MajorMn) + { + TLLM_CHECK_ERROR(tg::dtypeGetNumBits(options.mDtypeB) >= 8, "Subbyte types only support K major layout"); + } + + if ((options.mLayoutA == MatrixLayout::BlockMajorK) || (options.mLayoutB == MatrixLayout::BlockMajorK)) + { + bool const isBlockA = options.mLayoutA == MatrixLayout::BlockMajorK; + + // Block K size must be 128B. + // TODO Leaving this as an option for now in case we want to expertiment with other block sizes + // As the user is not expected to set this, do not fail if updateOptions is false + int32_t const elemSizeInBits + = (isBlockA) ? 
tg::dtypeGetNumBits(options.mDtypeA) : tg::dtypeGetNumBits(options.mDtypeB); + int32_t const elemsIn128B = 128 * 8 /* Bits in byte */ / elemSizeInBits; + + if (options.mBlockK != elemsIn128B) + { + if (updateOptions) + { + options.mBlockK = elemsIn128B; + } + else + { + return false; + } + } + + if (options.mBlockK > options.mTileK) + { + TLLM_CHECK_ERROR(options.mBlockK % options.mTileK == 0, + "If block size is greater than tile size, block size must be a multiple of tile size"); + } + else if (options.mBlockK < options.mTileK) + { + TLLM_CHECK_ERROR(options.mTileK % options.mBlockK == 0, + "If tile size is greater than block size, tile size must be a multiple of block size"); + } + } + + if (!isBiasTypeNone(options.mBiasType)) + { + TLLM_CHECK_ERROR(!isBiasTypeMn(options.mBiasType), "BiasType::Mn is not supported"); + TLLM_CHECK_ERROR(!options.mUseDeepSeekFp8, "Bias is not supported for DeepSeek Fp8"); + TLLM_CHECK_ERROR(!(options.mUsePerTokenSfA && options.mUsePerTokenSfB), "Bias is not supported for Meta Fp8"); + } + if (updateOptions) { // Init kernel traits. 
- options.mKernelTraits = KernelTraits(options.mDtypeElt, options.mDtypeC, options.mDtypeAcc, options.mTileM, - options.mTileN, options.mTileK, options.mEpilogueTileM, options.mEpilogueTileN, options.mNumStages, - options.mNumStagesMma, options.mNumSlicesForSplitK, options.mNumSlicesForSliceK, options.mSplitK, - options.mUseTmaStore, options.mTransposeMmaOutput, options.mAllReduceAlgo, - options.mTileScheduler == TileScheduler::Persistent, options.mUseDeepSeekFp8, options.mUsePerTokenSfA, - options.mUsePerTokenSfB); + options.mKernelTraits = KernelTraits(options.mDtypeA, options.mDtypeB, options.mDtypeC, options.mDtypeAcc, + options.mDtypeMmaA, options.mDtypeMmaB, options.mMmaKind, options.mTileM, options.mTileN, options.mTileK, + options.mEpilogueTileM, options.mEpilogueTileN, options.mNumStages, options.mNumStagesMma, + options.mNumSlicesForSplitK, options.mNumSlicesForSliceK, options.mSplitK, options.mUseTmaStore, + options.mTransposeMmaOutput, options.mAllReduceAlgo, options.mTileScheduler == TileScheduler::Persistent, + options.mUseDeepSeekFp8, options.mUsePerTokenSfA, options.mUsePerTokenSfB, options.mBiasType); } return true; @@ -857,3 +1319,5 @@ inline bool checkAndUpdateGemmOptions( #undef TLLM_LOG_ERROR #endif // TLLM_GEN_EXPORT_INTERFACE + +} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelMetaInfo.h index e4f9b89c93..d953deb571 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelMetaInfo.h @@ -19,92 +19,249 @@ #include "GemmGatedActOptions.h" +namespace gemmGatedAct +{ + namespace tensorrt_llm { namespace kernels { // clang-format off -#define TLLM_GEN_COMMIT "23d32a5" -#define TLLM_GEN_EXPORT_VERSION "0.0" +#define 
TLLM_GEN_COMMIT "32110ebf-dirty" +#define TLLM_GEN_EXPORT_VERSION "7.0.3.0" -static constexpr size_t tllmGenGemmGatedActListLen = 13; +static constexpr size_t tllmGenGemmGatedActListLen = 84; #ifndef EXCLUDE_SM_100 -extern unsigned char GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char 
GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin[]; -extern unsigned char GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char 
GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char 
GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char 
GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char 
GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char 
GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char 
GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char 
GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin[]; +extern unsigned char GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin[]; #endif // EXCLUDE_SM_100 #ifndef EXCLUDE_SM_100 -extern unsigned int GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin_len; 
-extern unsigned int GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len; -extern unsigned int GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int 
GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int 
GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int 
GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int 
GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int 
GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int 
GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int 
GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len; +extern unsigned int GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len; #endif // EXCLUDE_SM_100 static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { #ifndef EXCLUDE_SM_100 -{GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len, 86016, "gemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a", 448, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 136192, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "5b67e4f9514141995486e4b8c33f0f885552b26c3fe8cc3e1389136761ac4094", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 -, /* mClusterDimZ */ 4 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) -, /* mDtypeElt */ trtllm::gen::Dtype(17826819) +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* 
mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 64 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 +, /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 128 -, /* mMmaN */ 8 +, /* mMmaN */ 128 , /* mMockAllReduce */ 0 , /* mN */ 256 -, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 3 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) +, /* mSplitK */ gemm::SplitK(0) , /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 -, /* mTileN */ 8 +, /* mTileN */ 128 , /* mTileK */ 256 , /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 @@ -118,47 +275,61 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) - }, gemm::SmVersion::Sm100a }, -{GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin, 
GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin_len, 168960, "gemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 136192, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "4315ac2b0d090711080eb399c209955f6963f7ad05a12b6978fab9d88e874477", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) -, /* mDtypeElt */ trtllm::gen::Dtype(1050630) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileN */ 64 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 
-, /* mMmaK */ 32 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 128 , /* mMmaN */ 128 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 +, /* mNumStages */ 3 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 0 +, /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 , /* mTileK */ 256 @@ -172,45 +343,479 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) - }, gemm::SmVersion::Sm100a }, -{GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 112640, "gemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 136192, 
"gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "bf2d1b36035114d6193e72943740e99d7aad417cdea0c30e7a3e69c374255e94", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) -, /* mDtypeElt */ trtllm::gen::Dtype(1050630) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 16 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 +, /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 -, /* mMmaK */ 32 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 
+, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 136192, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "29494acc8be01db6fe7c482c3b50aa0e14b3dd4feabb8fc448c9f95f947517f0", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* 
mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 154624, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "9efcb65bb29b98c06f81a6359fc37a14ecaa774eaf5c6eeb006459994c0612bf", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 
1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* 
mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 154624, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "99792d201f89a353a375d5d18fed42a144371e46b32783e3e66e232d1b554ea0", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 
+, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 162816, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "4e60f235d519d3ee892b07b7ee21b13b933e629668c88c1de7571aa1843dbae7", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* 
mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 162816, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "364051e633dcd5bc7255cb13323ebabce66d9accb01e4693dc84a06379791bda", { { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ 
trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 126976, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "de187182b79ffa8209138ec7d14f30bb8d1f47cee37ea16e7ba790673c40684e", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 +, /* mNumStages */ 6 , /* mNumStagesMma */ 1 , /* 
mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) @@ -218,7 +823,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -228,101 +833,59 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) - }, gemm::SmVersion::Sm100a }, -{GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len, 110592, "gemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 124928, 
"gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "1d175d0142c0236322ac25c476b1bf9ba915a53416bdbb5680bfe08432313197", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 4 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) -, /* mDtypeElt */ trtllm::gen::Dtype(1050630) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) , /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 +, /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 4 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, 
/* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, /* mActType */ gemmGatedAct::ActType(0) - }, gemm::SmVersion::Sm100a }, -{GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len, 86016, "gemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a", 448, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 4 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) -, /* mDtypeElt */ trtllm::gen::Dtype(17826819) -, /* mDtypeC */ trtllm::gen::Dtype(17826819) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 4 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 6 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 
0 , /* mSplitK */ gemm::SplitK(2) @@ -342,31 +905,464 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) - }, gemm::SmVersion::Sm100a }, -{GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin_len, 218112, "gemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 126976, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "99fd786c8db39de59dc6fe34973afb9ce5dd044b90008105697df87fc24d8bdf", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) -, /* mDtypeElt */ trtllm::gen::Dtype(1050630) -, /* mDtypeC */ trtllm::gen::Dtype(1050630) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* 
mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, 
GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 124928, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "2f3b6546de54fe852ea5d47faa55561623b983c2059972e995d409e5f6ad1ac9", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* 
mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 205824, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "49c6f6729366a9e619ebcd50f931b9d8aae1fde98e05ea952480c8d7a8258b2c", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* 
mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 203776, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "0ba3a788108f96705971e8ecba7dd97c55be2e79918fde8086706c7942d36dac", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mDtypeAcc */ 
trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, 
gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 203776, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "0ba3a788108f96705971e8ecba7dd97c55be2e79918fde8086706c7942d36dac", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* 
mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 205824, "gemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "f1706cfa1f7cfcf07c98036f53efba2503beea1d3c1ba87d423bfcde6ea1ba04", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* 
mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 121856, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "3dd66b74165ec2840acec6d7d5f6598943759bff9d3df10b9b1852cc9d7d6bf9", { { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 , /* mMmaN */ 128 , /* mMockAllReduce */ 0 @@ -379,10 +1375,921 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 0 +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileK */ 128 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ 
trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 121856, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "cfb5a79df17aac6a15adae18b9c323056eb351b1fab1b961eded6762beb98485", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, 
/* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileK */ 128 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 194560, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "9bab7ba6449fe2ac4f3cd9835f8f3c064af150c4158923b7dfb8d98436a83d93", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* 
mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 194560, 
"gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "1e3f20bbc07c40b3c2a9b59398317cb9697ff2ec56f4f713cdc6fa51d94f9d8e", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 
+, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 219136, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "6f5263481c50465513910742b56f76224ae7f1167ec3900a5695fb76b50e1093", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ 
trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 219136, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "39e629d2222bee2438a0429c5b358334635600987d8bb71b3e16a6f68d22d68f", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* 
mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, 
GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 169984, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "d884002e16cdcf3fe600386c4bb98033d12bd88d695495923af2f9e746eda1a6", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* 
mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 169984, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "3afa69de26495aac2c51fb738efb3be9a56043e9eb28aa26ec4c8a1011d9e238", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 
+, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 217088, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "730666bd70b9cb9798960f75b14ee32150dce70f93b183889339af1bc935d52a", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, 
/* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, 
+{GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 215040, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 224, "7caa1bc595167e16fae6b3dc7c065b804cdf24bf9860a9a5f05fa881c05eb1fb", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 
0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 217088, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "093afd94b8aa7d3c9bfbbb37449fb1ae7b377a4932180bd1feb3208f519132d0", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* 
mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 215040, "gemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 224, "5c244b6342b61b29d97519a151f28b1fd26d2ef67a1eb1871f9c61fc260f8361", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* 
mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1052672) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* 
mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 128000, "gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "01d98ed8c0fba4621e78191f7c6fd93c4821e25ee7527bbc5c83c2a9c3368852", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* 
mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 128000, "gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "5fd38415175c2492d221bff2f008d356849b2ac5f936bc592e00b61c76dd5edd", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* 
mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 , /* mTileK */ 256 @@ -396,45 +2303,479 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) - }, gemm::SmVersion::Sm100a }, -{GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 111616, "gemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mClampBeforeAct */ 0 + }, 
gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 134144, "gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "e0ffe2a3ef05cc27c1d4867049431b58186e20ac7df5e5e51de3f7bd746f8fb2", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) -, /* mDtypeElt */ trtllm::gen::Dtype(1050630) -, /* mDtypeC */ trtllm::gen::Dtype(1050630) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 16 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 +, /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 -, /* mMmaK */ 32 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* 
mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 134144, "gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "c6a6eadb7e90bda398f45614f6d82350120b6b05abef1d5c5a874f7ffe99ebb2", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs 
*/ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 150528, 
"gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "5a9feacd1dc9854de7c2837582857525834e086eab8e688dfb8e215c745e3220", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, 
/* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 150528, "gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "ea8dff090f1cd624bb0e8c17f08c9964b00b38641a8faa674610ae13172e3b1e", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ 
trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 154624, "gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "8cb3c535c2168ae2cad0743c3aa9539471443c6a289cea713111bfa632de7934", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ 
trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, 
GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 154624, "gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "c93896ae65c029c4ec7dfc8f6e0763e999ef15daad4f3bea7d36fa5157c49d8e", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* 
mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 125952, "gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "af7781853572012961cea76b7e2a082d6d9342e43b31abe297a81db85b06d813", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ 
{} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 +, /* mNumStages */ 6 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) @@ -442,7 +2783,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -452,101 +2793,59 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) - }, gemm::SmVersion::Sm100a }, -{GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len, 110592, "gemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, 
+{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 124928, "gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "87fac270b79941879e5c23a8e8f59e5f1d96696f96d1ef0b3cd14fad5ffe5dba", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 4 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) -, /* mDtypeElt */ trtllm::gen::Dtype(1050630) -, /* mDtypeC */ trtllm::gen::Dtype(1050630) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 +, /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 4 -, /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ 
gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, /* mActType */ gemmGatedAct::ActType(0) - }, gemm::SmVersion::Sm100a }, -{GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len, 86016, "gemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a", 448, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 4 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) -, /* mDtypeElt */ trtllm::gen::Dtype(17826819) -, /* mDtypeC */ trtllm::gen::Dtype(1052680) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , 
/* mNumSlicesForSplitK */ 4 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 6 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) @@ -566,47 +2865,1391 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) - }, gemm::SmVersion::Sm100a }, -{GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin, GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin_len, 168960, "gemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 125952, "gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "c4a819db82a83df03e82c399c6fe741f068bbac46835eee69a358f99a29b7c1b", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) -, /* mDtypeElt */ trtllm::gen::Dtype(1050630) -, /* mDtypeC */ trtllm::gen::Dtype(1052680) +, /* mDtypeAcc 
*/ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 128 +, /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 , /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 , /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* 
mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 124928, "gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "d1a0af495d8108a7ae3ec48e87326181a93bb91ab901f662b8992f551c048c00", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 
0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 204800, "gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "4c1d3d6fd24e8ff2d04ce69c9c225b3e77b6dcb4fb977ee3717cd2f04f0166a1", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* 
mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 203776, "gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "27c9fec59bbe27e81111633837e72b2962ddf18f96194351bc9442acbc2343f3", { { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ 
trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 203776, "gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "27c9fec59bbe27e81111633837e72b2962ddf18f96194351bc9442acbc2343f3", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* 
mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 204800, "gemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "9b607b46dc0fb419b03807ce73d366a83d5a55352df0dba94c31d03fd9f06610", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits 
*/ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 113664, 
"gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "ec9a5dbd34f3ad0a17231641211733d0a1b69f11331f2b26ab9c2c00df542cae", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) , /* mMmaM */ 128 , /* mMmaN */ 128 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 2 +, /* mNumStages */ 3 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) -, /* mTransposeMmaOutput */ 0 +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileK */ 128 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* 
mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 113664, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "732c9c63ba7a5bc1fd15ab63e71cc601c2cbee4671eb804ddcc523a093bd0ee7", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* 
mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileK */ 128 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 192512, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "926cb38fb683ac2b8ad5d3b55c88e1535ebd0e6669fe060f4a2f2bab56e64bf8", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ 
trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, 
+{GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 192512, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "ae0b7ac83d2d4b5711c7dfb0261ffd459714372075e26140d61df4b0c101805d", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ 
gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 215040, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "1792b7a0e51fc1b15733d3c4d5e88436b71b43dcb303a762ea8d67365c514c43", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* 
mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 215040, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "be83e8927e5f0d7116881e625370ff27e9bd32dad586c6be5b19148a324b0526", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* 
mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) 
+ }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 161792, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "a20f89c5892265e6f3294aa2b80d7058e556817b5dbf083cddeae39d73b668d6", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* 
mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 161792, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "33a720803a9e091a306038f0bafaf1f6da9377695e312269ef47ce2848ead809", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, 
/* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 216064, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "204f40d2ec61881fdbdd1a8b302d18e2e34e2b3a92401c0613c849d7550245bf", { { /* mAllReduceAlgo */ 
gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ 
trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 215040, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 224, "c4e3e3dd484dac49034359fa74db1895e1c1f8b18e16a5c8d2a80d9065f1bf8a", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* 
mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 216064, "gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "0e1ee1d570e3501a7ae4e28ad518e73362814e871bd0c99eb16b1009db9dc255", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 
256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 215040, 
"gemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 224, "c39a61a71fe1c363ca8fb35c0a44201e226e2a37bbc9340b30d8693f6a2801a1", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, 
/* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 152576, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "116e2251afff6fc4b35e203e20e67fec51fea0bf080924d9f372d01e8c09dca9", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ 
trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 152576, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "2c6676fa241d1cfa228d38d752455367f406b6fdf63e6e743fcfb601d6cad4d6", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* 
mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 , /* mTileM */ 128 , /* mTileN */ 128 , /* mTileK */ 256 @@ -620,45 +4263,479 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) - }, gemm::SmVersion::Sm100a }, -{GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin, 
GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin_len, 112640, "gemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 140288, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "757616ca0f06ae3e788e03ef2eb0937d9e74f8f8253788e23eb4c1030c36b0c7", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 1 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) -, /* mDtypeElt */ trtllm::gen::Dtype(1050630) -, /* mDtypeC */ trtllm::gen::Dtype(1052680) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 +, /* mEpilogueTileN */ 16 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 +, /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* 
mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 -, /* mMmaK */ 32 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 140288, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "e588cb33ab99116fbcfdb2b17151573ebf82617182918d175415aa6f4cf98aa1", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ 
trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, 
gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 162816, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "d2d4ae0b52ea4f1d1f2a17a3279c0c8d71dc74ddd413046ffd1ed409b3cf80c0", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* 
mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 162816, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "d734b1b43833a67d9366198b0f92b464d9693fbe3e6f133de48c17b321ad7076", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* 
mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 179200, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "ea7fc2413963783f4c0c8e20e50916cf28c601a20e79bc8058f8da59322e3712", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* 
mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler 
*/ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 179200, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "7e28fe19e2cf656d223f81e393bb90a6ba39198c807e3c8f0192e5eb4dd5c514", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* 
mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 129024, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "a2860fc7708cea6ce0896cd07f98ca232f6a6d6f7daa29007968a9fcd44b4121", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* 
mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 1 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 3 +, /* mNumStages */ 6 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(0) @@ -666,7 +4743,7 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mTileM */ 128 , /* mTileN */ 8 , /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseUnrollLoop2xForMma */ 0 , /* mUseCustomMmaSchedule */ 1 , /* mUseHoistTryWaitForCustomMmaSchedule */ 0 , /* mUseDeepSeekFp8 */ 0 @@ -676,101 +4753,59 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mUseTwoTmaLoadWarps */ 1 , /* mUseTwoMmaWarps */ 0 , /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) - }, gemm::SmVersion::Sm100a }, -{GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len, 110592, 
"gemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a", 224, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 124928, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "fd5b2b93a53f2e998abadc4a3b4cb4204610606a38656b9712af9764e73a2736", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 , /* mClusterDimX */ 1 , /* mClusterDimY */ 1 , /* mClusterDimZ */ 4 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) -, /* mDtypeElt */ trtllm::gen::Dtype(1050630) -, /* mDtypeC */ trtllm::gen::Dtype(1052680) +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 , /* mEpilogueTileM */ 128 , /* mEpilogueTileN */ 8 , /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 +, /* mGridTriggerSecondaryB */ 0 , /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 -, /* mGridWaitForPrimaryB */ 1 -, /* mHoistMmaTaskTryWaits */ 0 -, /* mK */ 2048 -, /* mKernelTraits */ {} -, /* mM */ 256 -, /* mMmaK */ 32 -, /* mMmaM */ 128 -, /* mMmaN */ 8 -, /* mMockAllReduce */ 0 -, /* mN */ 256 -, /* mNumSlicesForSplitK */ 4 -, /* mNumSlicesForSliceK */ 1 -, /* 
mNumStages */ 3 -, /* mNumStagesMma */ 1 -, /* mNumStagesMmaWithinWorkTile */ 1 -, /* mNumStagesMmaAcrossWorkTile */ 1 -, /* mNumStagesWorkId */ 3 -, /* mOutputDebugTensors */ 0 -, /* mUseShuffledMatrixA */ 1 -, /* mSliceK */ 0 -, /* mSplitK */ gemm::SplitK(2) -, /* mTransposeMmaOutput */ 1 -, /* mTileM */ 128 -, /* mTileN */ 8 -, /* mTileK */ 256 -, /* mUseUnrollLoop2xForMma */ 1 -, /* mUseCustomMmaSchedule */ 1 -, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 -, /* mUseDeepSeekFp8 */ 0 -, /* mUsePerTokenSfA */ 0 -, /* mUsePerTokenSfB */ 0 -, /* mUseTmaStore */ 1 -, /* mUseTwoTmaLoadWarps */ 1 -, /* mUseTwoMmaWarps */ 0 -, /* mSfLayoutA */ trtllm::gen::SfLayout(3) -, /* mSfLayoutB */ trtllm::gen::SfLayout(3) -, /* mSfLayoutC */ trtllm::gen::SfLayout(3) -, /* mTileScheduler */ gemm::TileScheduler(0) - }, /* mActType */ gemmGatedAct::ActType(0) - }, gemm::SmVersion::Sm100a }, -{GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin_len, 86016, "gemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a", 448, {{ /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) -, /* mClusterDimX */ 1 -, /* mClusterDimY */ 1 -, /* mClusterDimZ */ 4 -, /* mDtypeAcc */ trtllm::gen::Dtype(1056777) -, /* mDtypeElt */ trtllm::gen::Dtype(17826819) -, /* mDtypeC */ trtllm::gen::Dtype(1056777) -, /* mEnablesEarlyExit */ 0 -, /* mEnablesDelayedEarlyExit */ 0 -, /* mEnablesGlobalPtxKnobs */ 1 -, /* mEpilogueTileM */ 128 -, /* mEpilogueTileN */ 8 -, /* mGridTriggerSecondaryA */ 0 -, /* mGridTriggerSecondaryB */ 1 -, /* mGridWaitForPrimaryEarlyExit */ 1 -, /* mGridWaitForPrimaryA */ 0 +, /* mGridWaitForPrimaryA */ 1 , /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 , /* mHoistMmaTaskTryWaits */ 0 , /* mK */ 2048 , /* 
mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) , /* mM */ 256 , /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) , /* mMmaM */ 128 , /* mMmaN */ 8 , /* mMockAllReduce */ 0 , /* mN */ 256 , /* mNumSlicesForSplitK */ 4 , /* mNumSlicesForSliceK */ 1 -, /* mNumStages */ 4 +, /* mNumStages */ 6 , /* mNumStagesMma */ 1 , /* mNumStagesMmaWithinWorkTile */ 1 , /* mNumStagesMmaAcrossWorkTile */ 1 , /* mNumStagesWorkId */ 3 , /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 , /* mUseShuffledMatrixA */ 1 , /* mSliceK */ 0 , /* mSplitK */ gemm::SplitK(2) @@ -790,11 +4825,1274 @@ static const gemmGatedAct::GemmGatedActConfig tllmGenGemmGatedActList[] = { , /* mSfLayoutA */ trtllm::gen::SfLayout(3) , /* mSfLayoutB */ trtllm::gen::SfLayout(1) , /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 , /* mTileScheduler */ gemm::TileScheduler(0) }, /* mActType */ gemmGatedAct::ActType(0) - }, gemm::SmVersion::Sm100a }, +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 129024, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "8b4191af5a1a18dcadebbfc266c942d5957a372b487eb959cbb7dd8ab8f36dcf", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* 
mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, 
GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 124928, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "e8a9038cc0ced071eb9386a64e9322752f8b9552d2c897b925ca8540312b85b8", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* 
mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 207872, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "0cd1fd3c55848cb2dd2a1b9920276bb0063c0e3700d2d6f604635bdda8ead2ce", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ 
{} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 203776, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "416072b8397e5025dae0a16ce7c734e7cd40c8c33baa65d0d3b29761e9dfacbc", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ 
trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, 
+{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 203776, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 448, "416072b8397e5025dae0a16ce7c734e7cd40c8c33baa65d0d3b29761e9dfacbc", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* 
mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 207872, "gemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 448, "0d4c6ba1dfd9cffc117d4dc341f090a202db2358d72b42f6dc9f891f4d846e12", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(17826818) +, /* mDtypeB */ trtllm::gen::Dtype(17826818) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) +, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* 
mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 64 +, /* mMmaKind */ trtllm::gen::MmaKind(4) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 512 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(1) +, /* mSfLayoutC */ trtllm::gen::SfLayout(1) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 138240, "gemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "c06536f6d5d938daa1f33e630bed439831af757dd44d055b46f562675b9512d7", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* 
mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileK */ 128 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ 
gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 138240, "gemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "3adb82e655205a0d821e860428f5cd55a03ad06d5d01b98372bec55db3a37899", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 128 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* 
mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 128 +, /* mTileK */ 128 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 198656, "gemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "8812b1becc29e5ab522797b5951bbb721d6022c46d8bab6f5756f2f4999e5fc7", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* 
mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 198656, "gemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, 
"ce8d3fa4a4e6c25f79ecfe60ce641b8bd0df788e551b213449416f99a005d93c", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 16 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 16 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 16 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ 
trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 227328, "gemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "f6ee8c1ff3d850c29a7c078cfbed67e92d53075acb425b8dc0d6a58d52e7f913", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* 
mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 227328, "gemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "6ffde3f730bf6203df01fd6b3874d9dee623da09ae3c05e990750cb3bd3c8bb5", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, 
/* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 32 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 32 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 5 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 32 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 186368, 
"gemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "606a2af8f34b76d77d2e3bb69e7d8a515e39f9ea90a9bb45d4986b3a0cb3a163", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* 
mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 186368, "gemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "69508eda91440eb9eb9b402dcab2c4b90352023dfbd5a0cd2cb7018a60c3a972", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 64 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ 
trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 64 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 3 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 64 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 219136, "gemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "ad022f58f0c498ff1089adde0c7bbb4a666dc1c261ef7af3b67d9e645116fd69", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ 
trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, 
GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 215040, "gemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 224, "647eba9e708e696300ae306a59cbd6f92bc9e300cf4519012a828dff8e06b6d9", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* 
mUseUnrollLoop2xForMma */ 0 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 219136, "gemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a", 224, "ca75d11c5f5dd5f1cb64a60fd95b3e9d6091987eb354e0e5a9b8980106660f2a", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 1 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits 
*/ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 1 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(0) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, +{GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin, GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin_len, 215040, "gemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a", 224, "eecb5bf45dfafe6bf0c12d8fb0d2dcc5940a821e6770150420cf7df8bdba04f2", { { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +, /* mBiasType */ gemm::BiasType(0) +, /* mBlockK */ -1 +, /* mClusterDimX */ 1 +, /* mClusterDimY */ 1 +, /* mClusterDimZ */ 4 +, /* mDtypeAcc */ trtllm::gen::Dtype(1056776) +, /* 
mDtypeA */ trtllm::gen::Dtype(1050629) +, /* mDtypeB */ trtllm::gen::Dtype(1050629) +, /* mDtypeC */ trtllm::gen::Dtype(1056776) +, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) +, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEnablesEarlyExit */ 0 +, /* mEnablesDelayedEarlyExit */ 0 +, /* mEnablesGlobalPtxKnobs */ 1 +, /* mEpilogueLdtmDps */ 16 +, /* mEpilogueLdtmBits */ 256 +, /* mEpilogueTileM */ 128 +, /* mEpilogueTileN */ 8 +, /* mGridTriggerSecondaryA */ 0 +, /* mGridTriggerSecondaryB */ 0 +, /* mGridWaitForPrimaryEarlyExit */ 1 +, /* mGridWaitForPrimaryA */ 1 +, /* mGridWaitForPrimaryB */ 1 +, /* mHoistLoadTaskInit */ 1 +, /* mHoistMmaTaskTryWaits */ 0 +, /* mK */ 2048 +, /* mKernelTraits */ {} +, /* mLayoutA */ gemm::MatrixLayout(0) +, /* mLayoutB */ gemm::MatrixLayout(0) +, /* mM */ 256 +, /* mMmaK */ 32 +, /* mMmaKind */ trtllm::gen::MmaKind(2) +, /* mMmaM */ 128 +, /* mMmaN */ 8 +, /* mMockAllReduce */ 0 +, /* mN */ 256 +, /* mNumSlicesForSplitK */ 4 +, /* mNumSlicesForSliceK */ 1 +, /* mNumStages */ 6 +, /* mNumStagesMma */ 1 +, /* mNumStagesMmaWithinWorkTile */ 1 +, /* mNumStagesMmaAcrossWorkTile */ 1 +, /* mNumStagesWorkId */ 3 +, /* mOutputDebugTensors */ 0 +, /* mPatchF2fp */ 0 +, /* mUseShuffledMatrixA */ 1 +, /* mSliceK */ 0 +, /* mSplitK */ gemm::SplitK(2) +, /* mTransposeMmaOutput */ 1 +, /* mTileM */ 128 +, /* mTileN */ 8 +, /* mTileK */ 256 +, /* mUseUnrollLoop2xForMma */ 1 +, /* mUseCustomMmaSchedule */ 1 +, /* mUseHoistTryWaitForCustomMmaSchedule */ 0 +, /* mUseDeepSeekFp8 */ 0 +, /* mUsePerTokenSfA */ 0 +, /* mUsePerTokenSfB */ 0 +, /* mUseTmaStore */ 1 +, /* mUseTwoTmaLoadWarps */ 1 +, /* mUseTwoMmaWarps */ 0 +, /* mSfLayoutA */ trtllm::gen::SfLayout(3) +, /* mSfLayoutB */ trtllm::gen::SfLayout(3) +, /* mSfLayoutC */ trtllm::gen::SfLayout(3) +, /* mSfReshapeFactor */ 1 +, /* mTileScheduler */ gemm::TileScheduler(0) + }, /* mActType */ gemmGatedAct::ActType(0) +, /* mClampBeforeAct */ 0 + }, gemm::SmVersion::Sm100a}, #endif // 
EXCLUDE_SM_100 }; // clang-format on } // namespace kernels } // namespace tensorrt_llm +} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelParams.h index 4a7bde2a17..dc6c9a928b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelParams.h @@ -26,6 +26,9 @@ namespace gemmGatedAct { +namespace gemmGatedAct +{ + //////////////////////////////////////////////////////////////////////////////////////////////////// namespace tg = trtllm::gen; @@ -34,7 +37,7 @@ namespace tg = trtllm::gen; #ifdef TLLM_ENABLE_CUDA CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector const& shapes, - std::vector const& strides, int32_t tileSizeMn, int32_t tileSizeK, void* gmemAddr) + std::vector const& strides, std::vector const& tileShapes, void* gmemAddr) { CUtensorMap desc{}; // The data type. @@ -67,22 +70,22 @@ CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector const& s // The swizzle type. 
CUtensorMapSwizzle swizzleType{CU_TENSOR_MAP_SWIZZLE_NONE}; - int32_t tileKSizeInBytes = (tileSizeK * tg::dtypeGetNumBits(dtype)) / /* bits */ 8; - if ((tileKSizeInBytes % 128) == 0) + int32_t fastestDimTileSizeBytes = (tileShapes[0] * tg::dtypeGetNumBits(dtype)) / /* bits */ 8; + if ((fastestDimTileSizeBytes % 128) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_128B; } - else if ((tileKSizeInBytes % 64) == 0) + else if ((fastestDimTileSizeBytes % 64) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_64B; } - else if ((tileKSizeInBytes % 32) == 0) + else if ((fastestDimTileSizeBytes % 32) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_32B; } else { - std::cerr << "Unexpected tileKSizeInBytes " << tileKSizeInBytes << std::endl; + std::cerr << "Unexpected fastestDimTileSizeBytes " << fastestDimTileSizeBytes << std::endl; assert(false); } @@ -91,8 +94,8 @@ CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector const& s // Check shape must be in range [1, 2^32] int32_t dim = shapes.size(); - // Expect 2 dimensions. - assert(dim == 2); + // Expect 2 dimensions for regular gemm or 3 dimensions for blocked layout + assert(dim == 2 || dim == 3); // Check shape range. for (int32_t ii = 0; ii < dim; ++ii) { @@ -117,19 +120,31 @@ CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector const& s // The number of elements in 128B. auto const numEltsIn128B = numEltsPerUInt32 /*4B*/ * 32; // The number of tile K hidden size (per token) in each block of shared memory. - auto const numEltsInClampedTileKSize = std::min(numEltsIn128B, tileSizeK); + auto const numEltsInClampedFastestTileSize = std::min(numEltsIn128B, tileShapes[0]); - // Build tile shapes. - std::vector tileShapes(dim, 1); - tileShapes[0] = numEltsInClampedTileKSize; // tileSizeK - tileShapes[1] = tileSizeMn; // tileSizeMn + // Build box dim array. If tileShapes is smaller than dim, just fill with 1s. 
+ assert(static_cast(tileShapes.size()) <= dim); + std::vector boxDim(dim, 1); + boxDim[0] = numEltsInClampedFastestTileSize; + for (size_t ii = 1; ii < tileShapes.size(); ++ii) + { + if (tileShapes[ii] > 256) + { + std::cerr << "buildNdTmaDescriptor: boxDim too large " << tileShapes[ii] << std::endl; + assert(false); + } + else + { + boxDim[ii] = tileShapes[ii]; + } + } // Set tile strides to 0; std::vector tileStrides(dim, 1); // Build the descriptor. CUresult result = cuTensorMapEncodeTiled(&desc, tmaDataFormat, - /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), tileShapes.data(), tileStrides.data(), + /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), boxDim.data(), tileStrides.data(), /*interleave=*/CU_TENSOR_MAP_INTERLEAVE_NONE, swizzleType, /*l2Promotion=*/CU_TENSOR_MAP_L2_PROMOTION_L2_128B, /*oobFill=*/CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); @@ -140,10 +155,34 @@ CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector const& s std::cerr << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; - std::cerr << "Shape: " << shapes[0] << " " << shapes[1] << std::endl; - std::cerr << "Stride: " << stridesInBytes[0] << std::endl; - std::cerr << "tileShapes: " << tileShapes[0] << " " << tileShapes[1] << std::endl; - std::cerr << "tileStrides: " << tileStrides[0] << " " << tileStrides[1] << std::endl; + + std::cerr << "Shape: "; + for (int ii = 0; ii < dim; ++ii) + { + std::cerr << shapes[ii] << " "; + } + std::cerr << std::endl; + + std::cerr << "Stride: "; + for (int ii = 0; ii < dim - 1; ++ii) + { + std::cerr << stridesInBytes[ii] << " "; + } + std::cerr << std::endl; + + std::cerr << "tileShapes: "; + for (int ii = 0; ii < dim; ++ii) + { + std::cerr << boxDim[ii] << " "; + } + std::cerr << std::endl; + + std::cerr << "tileStrides: "; + for (int ii = 0; ii < dim; ++ii) + { + std::cerr << tileStrides[ii] << " "; + } + std::cerr << std::endl; std::cerr << "swizzleType: " << 
int(swizzleType) << std::endl; assert(false); } @@ -165,16 +204,54 @@ struct KernelParams // TMA descriptor for A. // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideAb. Logical shape is [M, K]. Logical strides are [K, 1]. Tile box shape is - // [tileM, tileK]. Tile box strides are [tileK, 1]. - // Dtype is set from options.mDtypeElt. + // makeTmaShapeStrideAb. + // + // If layoutA is MatrixLayout::MajorK + // Logical shape is [M, K]. + // Logical strides are [K, 1]. + // Tile box shape is [tileM, tileK]. + // Tile box strides are [tileK, 1]. + // Dtype is set from options.mDtypeA. + // + // If layoutA is MatrixLayout::MajorMn + // Logical shape is [K, M]. + // Logical strides are [M, 1]. + // Tile box shape is [tileK, tileM]. + // Tile box strides are [tileM, 1]. + // Dtype is set from options.mDtypeA. + // + // If layoutA is MatrixLayout::BlockMajorK + // Logical shape is [K / blockK, M, blockK]. + // Logical strides are [M * blockK, blockK, 1]. + // Tile box shape is [tileK / min(blockK, tileK), tileM, min(blockK, tileK)]. + // Tile box strides are [tileM * min(blockK, tileK), min(blockK, tileK), 1]. + // Dtype is set from options.mDtypeA, and blockK is 128B. CUtensorMap tmaA; // TMA descriptor for B. // Must be setup using gemm::buildNdTmaDescriptor with shapes and strides from - // makeTmaShapeStrideAb. Logical shape is [N, K]. Logical strides are [K, 1]. Tile box shape is - // [tileN, tileK]. Tile box strides are [tileK, 1]. - // Dtype is set from options.mDtypeElt. + // makeTmaShapeStrideAb. + // + // If layoutB is MatrixLayout::MajorK + // Logical shape is [N, K]. + // Logical strides are [K, 1]. + // Tile box shape is [tileN, tileK]. + // Tile box strides are [tileK, 1]. + // Dtype is set from options.mDtypeB. + // + // If layoutB is MatrixLayout::MajorMn + // Logical shape is [K, N]. + // Logical strides are [N, 1]. + // Tile box shape is [tileK, tileN]. + // Tile box strides are [tileN, 1]. 
+ // Dtype is set from options.mDtypeB. + // + // If layoutB is MatrixLayout::BlockMajorK + // Logical shape is [K / blockK, N, blockK]. + // Logical strides are [N * blockK, blockK, 1]. + // Tile box shape is [tileK / min(blockK, tileK), tileN, min(blockK, tileK)]. + // Tile box strides are [tileN * min(blockK, tileK), min(blockK, tileK), 1]. + // Dtype is set from options.mDtypeB, and blockK is 128B. CUtensorMap tmaB; // TMA descriptor for C, (when useTmaStore is true) @@ -236,21 +313,21 @@ struct KernelParams // When transposeMmaOutput is true, the shape is [N, M / 2]. // Otherwise, the shape is [M, N / 2]. // Elements in a given row are stored contiguously in memory (row-major). - void* ptrC; + void* ptrC{nullptr}; // The scaling factors to dequantize A. // It is used when the DeepSeek FP8 recipe is enabled. Otherwise should be set to nullptr. // If transposeMmaOutput is false, shape is [K / 128, M]. // Otherwise, shape is [M / 128, K / 128]. // The rightmost dimension is contiguous in memory. - void const* ptrSfA; + void const* ptrSfA{nullptr}; // The scaling factors to dequantize B. // It is used when the DeepSeek FP8 recipe is enabled. Otherwise should be set to nullptr. // If transposeMmaOutput is false, shape is [N / 128, K / 128]. // Otherwise, shape is [K / 128, N]. // The rightmost dimension is contiguous in memory. - void const* ptrSfB; + void const* ptrSfB{nullptr}; // The per-token scaling factors from scale A. // @@ -260,7 +337,7 @@ struct KernelParams // transposed). The dtype is Dtype::Bfloat16 // // The shape is [M] - void const* ptrPerTokenSfA; + void const* ptrPerTokenSfA{nullptr}; // The per-token scaling factors from scale B. // @@ -270,7 +347,22 @@ struct KernelParams // transposed). The dtype is Dtype::Bfloat16 // // The shape is [N] - void const* ptrPerTokenSfB; + void const* ptrPerTokenSfB{nullptr}; + + // The bias applied after the GEMM and before the activation function. 
+ // The bias is applied before applying the global scaling factor. I.e. + // C = act(A * B + bias') * scaleC + // scaleC = dequantA * dequantB * quantC + // Thus, the bias' = bias / (dequantA * dequantB), where the bias is the original bias. + // + // if BiasType is N, the shape is [N] + // The bias is broadcasted along the M dimension. + // + // if BiasType is M, the shape is [M] + // The bias is broadcasted along the N dimension. + // + // The dtype is float32. + void const* ptrBias{nullptr}; // The scaling factors calculated when quantizing C, for MxFp{4,8} and NvFp4 formats, also // used for the DeepSeek FP8 recipe. @@ -284,7 +376,7 @@ struct KernelParams // If transposeMmaOutput is false, shape is [M, N / 2 / 16]. // Otherwise, shape is [N, M / 2 / 16]. // The layout is controlled by options.mSfLayoutC (either R128c4 or R8c4). - void* ptrSfC; + void* ptrSfC{nullptr}; // Output is equal to // y = act(ptrScaleGate[0] * y1) * (ptrScaleC[0] * y2) @@ -292,10 +384,29 @@ struct KernelParams // The output tensor scaling factor for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization. // TensorRT-LLM API requires a scaling factor on the device. // Shape is [1]. - float const* ptrScaleC; + float const* ptrScaleC{nullptr}; // The output gate scale for MxFp{4,8}, NvFp4 and DeepSeek FP8 quantization. // Shape is [1]. - float const* ptrScaleGate; + float const* ptrScaleGate{nullptr}; + + // The clamp limit before the activation. + // Shape is [1]. + // Clamp is INF if nullptr. + // If applied on SwiGlu, it will be: + // + // x_glu = x_glu.clamp(min=None, max=limit) + // x_linear = x_linear.clamp(min=-limit, max=limit) + float const* ptrClampLimit{nullptr}; + + // The alpha and beta for SwiGlu. + // Shape is [1]. One alpha and one beta per tensor in batch. + // Alpha is 1.f if nullptr. + // Beta is 0.f if nullptr. 
+ // The formula: + // + // out_glu = x_glu * torch.sigmoid(alpha * x_glu) * (x_linear + beta) + float const* ptrSwiGluAlpha{nullptr}; + float const* ptrSwiGluBeta{nullptr}; // The M dimension. // It is the total number of tokens if A is the activation matrix. @@ -328,12 +439,12 @@ struct KernelParams // Pointer for partial row max for DeepSeek FP8 recipe. // This is temporary storage for the row max results. // The shape is [2, M, N / 128] and the dtype is float. - float* ptrPartialRowMax; + float* ptrPartialRowMax{nullptr}; // Flags in global memory that sync on "exit" for row max computation. // The size is numTilesM * numTilesN / 2 and the dtype is uint32_t. // The memory must be set to 0 before the kernel launch. - uint32_t* ptrRowMaxCompletionBars; + uint32_t* ptrRowMaxCompletionBars{nullptr}; enum class MatrixType { @@ -346,13 +457,24 @@ struct KernelParams template static auto makeTmaShapeStrideAbc(GemmOptions const& options, MatrixType matrixType) { + // The outer dimension. auto numTokens = (matrixType == MatrixType::MatrixA || matrixType == MatrixType::MatrixC) ? options.mM : options.mN; + // The outer dimension tile size. + auto tileNumTokens = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileM + : (matrixType == MatrixType::MatrixA) ? options.mTileM + : options.mTileN; + // The inner dimension. auto hiddenSize = (matrixType == MatrixType::MatrixC) ? options.mN / 2 : options.mK; + // The inner dimension tile size. + auto tileHiddenSize = (matrixType == MatrixType::MatrixC) ? options.mEpilogueTileN / 2 : options.mTileK; + // Swap variables if transpose output if (matrixType == MatrixType::MatrixC && options.mTransposeMmaOutput) { numTokens = options.mN; hiddenSize = options.mM / 2; + tileNumTokens = options.mEpilogueTileN; + tileHiddenSize = options.mEpilogueTileM / 2; } // The cute tensor shape for A/B: (numTokens, hiddenSize). 
// Note that TMA descriptor expects the first dimension's stride to be @@ -363,12 +485,41 @@ struct KernelParams // Swap the first two dimension as mentioned before. auto stride = std::vector{1, static_cast(hiddenSize)}; - return std::make_tuple(shape, stride); + // Assemble the box shape + std::vector tileShape = {tileHiddenSize, tileNumTokens}; + + // Alternate layouts do not apply to matrixC + if (matrixType != MatrixType::MatrixC) + { + gemm::MatrixLayout layout = (matrixType == MatrixType::MatrixA) ? options.mLayoutA : options.mLayoutB; + if (layout == gemm::MatrixLayout::MajorMn) + { + // Apply transpose if necessary + std::swap(shape[0], shape[1]); + stride[1] = numTokens; + std::swap(tileShape[0], tileShape[1]); + } + else if (layout == gemm::MatrixLayout::BlockMajorK) + { + // Set shapes based on blocking layout + shape = {static_cast(options.mBlockK), static_cast(numTokens), + static_cast(options.mK / options.mBlockK)}; + stride + = {1, static_cast(options.mBlockK), static_cast(numTokens * options.mBlockK)}; + + // If blockK > tileK, then the inner most box size will be based on the tile + int32_t const tileBlockK = std::min(options.mBlockK, tileHiddenSize); + tileShape = {tileBlockK, tileNumTokens, tileHiddenSize / tileBlockK}; + } + } + + return std::make_tuple(shape, stride, tileShape); } // Create the TMA shape/stride for A/B block scaling factors. template - static auto makeTmaShapeStrideSfAb(GemmOptions const& options, MatrixType matrixType, tg::SfLayout layout) + static auto makeTmaShapeStrideSfAb( + GemmOptions const& options, MatrixType matrixType, tg::SfLayout layout, int sfReshapeFactor) { // The outer dimension. auto numTokens = matrixType == MatrixType::MatrixA ? options.mM : options.mN; @@ -378,8 +529,10 @@ struct KernelParams auto numTokensPerTile = matrixType == MatrixType::MatrixA ? options.mTileM : options.mTileN; // The inner tile dimension. auto hiddenSizePerTile = options.mTileK; + // The dtype of the matrix. 
+ tg::Dtype matrixDtype = matrixType == MatrixType::MatrixA ? options.mDtypeA : options.mDtypeB; // Number of elements per scaling factor. - int32_t const numEltsPerSf = (options.mDtypeElt == tg::Dtype::E2m1) ? 16 : 32; + int32_t const numEltsPerSf = (matrixDtype == tg::Dtype::E2m1) ? 16 : 32; switch (layout) { @@ -417,15 +570,36 @@ struct KernelParams { // The scaling factor tensor packs 8x4 tiles into contiguous 32B blocks. // - // As the inner dimension (k) is required to be a multiple of the tile size, we - // can reshape to use fewer read requests, if the tile dimensions allow. + // As the inner dimension (k) is often a multiple of the tile size, we can reshape to use + // fewer read requests, if the tile dimensions allow. It does not reduce the number of + // instructions. + // // I.e., let's define r = min(⌈hiddenSizePerTile / (numEltsPerSf * 4)⌉, 8) // - // The "logical" tensor is: [outer, inner / numEltsPerSf] - // The 8x4 SF layout is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf), 32] - // The TMA tensor shape is: [⌈outer / 128⌉, inner / (4 * numEltsPerSf * r), r * 32] + // The "logical" tensor is: [outer, inner / numEltsPerSf] + // The 8x4 SF layout is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf), 32] + // The TMA tensor shape is: [⌈outer / 8⌉, inner / (4 * numEltsPerSf * r), r * 32] + // + // The caveat of NumRepeats>1 is we must pad the hidden dimension of SF to multiples of + // NumRepeats * numEltsPerSf * 4. - int const repeats = std::min(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4), 8); + // Detect if the supplied factor is power of 2. E.g., 0b0100 and (0b0100 - 1) == 0b0000. + int const r = sfReshapeFactor; + if (r > 0 && (r & (r - 1)) != 0) + { + throw std::runtime_error( + "mSfReshapeFactor must be positive and a power of 2. Found " + std::to_string(r)); + } + + // Sanitize number of repeats so it doesn't exceed the dimension. 
+ int const repeats = std::min(tg::ceilDiv(hiddenSizePerTile, numEltsPerSf * 4), r); + + // Detect if the input hidden size K is a multiple of the repeats. + if (tg::ceilDiv(hiddenSize, numEltsPerSf * 4) % repeats != 0) + { + throw std::runtime_error("SF hiddenSize K (" + std::to_string(tg::ceilDiv(hiddenSize, numEltsPerSf * 4)) + + ") must be a multiple of repeats (" + std::to_string(repeats) + ")"); + } auto shape = std::vector{static_cast(repeats * 32), static_cast(tg::ceilDiv(hiddenSize, numEltsPerSf * 4 * repeats)), @@ -445,7 +619,7 @@ struct KernelParams return std::make_tuple(shape, stride, tileShapes); } - default: assert(false); + default: throw std::runtime_error("Unsupported SF layout"); } return std::make_tuple(std::vector{}, std::vector{}, std::vector{}); } @@ -453,38 +627,43 @@ struct KernelParams // Setup the kernel parameters. template static KernelParams setKernelParams(GemmOptions_ const& options, void const* ptrA, void const* ptrSfA, - void const* ptrPerTokenSfA, void const* ptrB, void const* ptrSfB, void const* ptrPerTokenSfB, void* ptrC, - float const* ptrScaleC, void* ptrSfC, float const* ptrScaleGate, float* rowMax, uint32_t* rowMaxBars) + void const* ptrPerTokenSfA, void const* ptrB, void const* ptrSfB, void const* ptrPerTokenSfB, + void const* ptrBias, void* ptrC, float const* ptrScaleC, void* ptrSfC, float const* ptrScaleGate, + float const* ptrClampLimit, float const* ptrSwiGluAlpha, float const* ptrSwiGluBeta, float* rowMax, + uint32_t* rowMaxBars) { // Create the return struct. KernelParams params; // Shape/stride for gmem tensor A. - auto [shapeA, strideA] = makeTmaShapeStrideAbc(options, MatrixType::MatrixA); + auto [shapeA, strideA, tileShapeA] = makeTmaShapeStrideAbc(options, MatrixType::MatrixA); // Build tma descriptor for A. 
- params.tmaA = gemmGatedAct::buildNdTmaDescriptor( - options.mDtypeElt, shapeA, strideA, options.mTileM, options.mTileK, const_cast(ptrA)); + params.tmaA + = gemmGatedAct::buildNdTmaDescriptor(options.mDtypeA, shapeA, strideA, tileShapeA, const_cast(ptrA)); // Shape/stride for gmem tensor B. - auto [shapeB, strideB] = makeTmaShapeStrideAbc(options, MatrixType::MatrixB); + auto [shapeB, strideB, tileShapeB] = makeTmaShapeStrideAbc(options, MatrixType::MatrixB); // Build tma descriptor for B. - params.tmaB = gemmGatedAct::buildNdTmaDescriptor( - options.mDtypeElt, shapeB, strideB, options.mTileN, options.mTileK, const_cast(ptrB)); + params.tmaB + = gemmGatedAct::buildNdTmaDescriptor(options.mDtypeB, shapeB, strideB, tileShapeB, const_cast(ptrB)); - if (options.mDtypeElt == tg::Dtype::E2m1 || options.mDtypeElt == tg::Dtype::MxE4m3) + if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE4m3) { - tg::Dtype const dTypeSf = tg::dtypeGetBlockSfType(options.mDtypeElt); + tg::Dtype const dTypeSf = tg::dtypeGetBlockSfType(options.mDtypeA); // Build TMA descriptor for gmem A block scaling factors. auto [shapeSfA, strideSfA, tileShapesSfA] - = makeTmaShapeStrideSfAb(options, MatrixType::MatrixA, tg::SfLayout::R128c4); + = makeTmaShapeStrideSfAb(options, MatrixType::MatrixA, tg::SfLayout::R128c4, options.mSfReshapeFactor); params.tmaSfA = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfA, strideSfA, tileShapesSfA, const_cast(ptrSfA)); - + } + if (options.mDtypeB == tg::Dtype::E2m1 || options.mDtypeB == tg::Dtype::MxE4m3) + { + tg::Dtype const dTypeSf = tg::dtypeGetBlockSfType(options.mDtypeB); // Build TMA descriptor for gmem B block scaling factors. 
auto [shapeSfB, strideSfB, tileShapesSfB] - = makeTmaShapeStrideSfAb(options, MatrixType::MatrixB, options.mSfLayoutB); + = makeTmaShapeStrideSfAb(options, MatrixType::MatrixB, options.mSfLayoutB, options.mSfReshapeFactor); params.tmaSfB = gemm::buildSfTmaDescriptor(dTypeSf, shapeSfB, strideSfB, tileShapesSfB, const_cast(ptrSfB)); } @@ -492,14 +671,10 @@ struct KernelParams if (options.mUseTmaStore) { // Shape/stride for gmem tensor C. - auto [shapeC, strideC] = makeTmaShapeStrideAbc(options, MatrixType::MatrixC); - - // Swap M and N tiles for the M-major epilogue. - auto outputTileM = options.mTransposeMmaOutput ? options.mEpilogueTileN : options.mEpilogueTileM; - auto outputTileN = options.mTransposeMmaOutput ? options.mEpilogueTileM : options.mEpilogueTileN; + auto [shapeC, strideC, tileShapeC] = makeTmaShapeStrideAbc(options, MatrixType::MatrixC); // Build tma descriptor for C. params.tmaC = gemmGatedAct::buildNdTmaDescriptor( - options.mDtypeC, shapeC, strideC, outputTileM, outputTileN / 2, const_cast(ptrC)); + options.mDtypeC, shapeC, strideC, tileShapeC, const_cast(ptrC)); } params.ptrC = ptrC; @@ -512,8 +687,13 @@ struct KernelParams params.ptrPerTokenSfA = ptrPerTokenSfA; params.ptrPerTokenSfB = ptrPerTokenSfB; + params.ptrBias = ptrBias; + params.ptrScaleC = ptrScaleC; params.ptrScaleGate = ptrScaleGate; + params.ptrClampLimit = ptrClampLimit; + params.ptrSwiGluAlpha = ptrSwiGluAlpha; + params.ptrSwiGluBeta = ptrSwiGluBeta; params.rank = 0; params.tpGrpSize = 1; @@ -532,3 +712,5 @@ struct KernelParams //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace gemmGatedAct + +} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelTraits.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelTraits.h index 1c3d4581c4..34189eebb0 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelTraits.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/KernelTraits.h @@ -20,6 +20,10 @@ #include "trtllm/gen/CommonUtils.h" #include "trtllm/gen/DtypeDecl.h" #include +#include + +namespace gemmGatedAct +{ namespace gemm { @@ -74,6 +78,38 @@ public: } // Returns the offset of the ith chunk + int32_t getChunkOffsetByName(std::string const& name) const + { + for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii) + { + if (mSmemChunkNames[ii] == name) + { + return getChunkOffset(ii); + } + } + throw std::runtime_error("Name not found: " + name); + } + + // Returns the first chunk reuse flag given chunk name. + int getFirstChunkReuseFlagByName(std::string const& name) const + { + for (size_t ii = 0; ii < mSmemChunkNames.size(); ++ii) + { + if (mSmemChunkNames[ii] == name) + { + return getFirstChunkReuseFlag(ii); + } + } + throw std::runtime_error("Name not found: " + name); + } + + // Function to calculate the total size of the SMEM array + int32_t getTotalSize() const + { + return getOffsetBeforeChunk(static_cast(mNumBytesAndAlignmentPerSmemChunk.size())); + } + +private: int32_t getChunkOffset(int32_t ii) const { if (mFirstChunkReuse[ii]) @@ -88,12 +124,6 @@ public: return getSizePaddedToAlignment(offset, mNumBytesAndAlignmentPerSmemChunk[ii].second); } - // Function to calculate the total size of the SMEM array - int32_t getTotalSize() const - { - return getOffsetBeforeChunk(static_cast(mNumBytesAndAlignmentPerSmemChunk.size())); - } - // Returns the first chunk reuse flag for the ith chunk. 
int getFirstChunkReuseFlag(int32_t ii) const { @@ -132,6 +162,24 @@ private: //////////////////////////////////////////////////////////////////////////////////////////////////// +int getNumSmemBitsPerElt(tg::Dtype dtype, tg::MmaKind mmaKind) +{ + if (mmaKind == tg::MmaKind::Auto) + { + throw std::runtime_error("mmaKind != tg::MmaKind::Auto"); + } + if (mmaKind == tg::MmaKind::MxFp8Fp6Fp4) + { + return 8; + } + else + { + return tg::dtypeGetNumBits(dtype); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + class KernelTraits { public: @@ -139,11 +187,13 @@ public: KernelTraits() {} // The constructor. - KernelTraits(tg::Dtype dtypeElt, tg::Dtype dtypeC, tg::Dtype dtypeAcc, int32_t tileM, int32_t tileN, int32_t tileK, - int32_t epilogueTileM, int32_t epilogueTileN, int32_t numStages, int32_t numStagesMma, - int32_t numSlicesForSplitK, int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore, - bool transposeMmaOutput, AllReduceAlgo allReduceAlgo, bool usePersistentScheduler, bool useDeepSeekFp8, - bool usePerTokenSfA, bool usePerTokenSfB) + KernelTraits(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeAcc, tg::Dtype dtypeMmaA, + tg::Dtype dtypeMmaB, tg::MmaKind mmaKind, int32_t tileM, int32_t tileN, int32_t tileK, int32_t epilogueTileM, + int32_t epilogueTileN, int32_t numStages, int32_t numStagesMma, int32_t numSlicesForSplitK, + int32_t numSlicesForSliceK, SplitK splitK, bool useTmaStore, bool transposeMmaOutput, + AllReduceAlgo allReduceAlgo, bool usePersistentScheduler, bool useDeepSeekFp8, bool usePerTokenSfA, + bool usePerTokenSfB, BiasType biasType) + : mMmaKind{mmaKind} { // // SMEM @@ -157,13 +207,19 @@ public: // [rowMax ] (16B aligned) (if needed) // [sliceK ] (16B aligned) (if needed) // [per-token SF ] (16B aligned) (if needed) + // [bias ] (16B aligned) (if needed) // // SMEM for smemA and smemB might be repurposed and used for gmemC0 and gmemC1: // // 
[..smemA..][..smemB..][..smemBShuffle..] - // [..gmemC0..][..gmemC1..][..rowMax..][..sliceK..] + // [..gmemC0..][..gmemC1..][..rowMax..][..sliceK..][..per-token SF..][..bias..] // + if (mMmaKind == tg::MmaKind::Auto) + { + mMmaKind = dtypeGetMmaKind(dtypeMmaA, dtypeMmaB); + } + std::vector> numBytesAndAlignmentPerSmemChunk; std::vector firstChunkReuseSmem; // Buffer names for inspection purposes. @@ -172,7 +228,8 @@ public: // LoadA { // Number of bytes in load A shared memory. - auto const numSmemBytesLoadA = numStages * tileM * tileK * tg::dtypeGetNumBits(dtypeElt) / 8 /* bits */; + auto const numSmemBytesLoadA + = numStages * tileM * tileK * getNumSmemBitsPerElt(dtypeA, mMmaKind) / 8 /* bits */; // Number of bytes for load A alignment for TMA load. auto const numBytesAlignmentLoadA = 1024; // loadA is already at first chunk. No need to reuse it. @@ -187,7 +244,8 @@ public: // LoadB { // Number of bytes in load B shared memory. - auto const numSmemBytesLoadB = numStages * tileN * tileK * tg::dtypeGetNumBits(dtypeElt) / 8 /* bits */; + auto const numSmemBytesLoadB + = numStages * tileN * tileK * getNumSmemBitsPerElt(dtypeB, mMmaKind) / 8 /* bits */; // Number of bytes for load B alignment for TMA load. auto const numBytesAlignmentLoadB = 1024; // No need to reuse the first chunk. @@ -207,7 +265,7 @@ public: { // Number of bytes in save shuffled B in shared memory. auto const numSmemBytesLoadB = numSlicesForSliceK > 1 - ? numStages * tileN * tileK * tg::dtypeGetNumBits(dtypeElt) / 8 /* bits */ + ? numStages * tileN * tileK * getNumSmemBitsPerElt(dtypeB, mMmaKind) / 8 /* bits */ : 0; // Number of bytes for load B alignment for TMA load. 
auto const numBytesAlignmentLoadB = 1024; @@ -313,6 +371,29 @@ public: firstChunkReuseSmem.emplace_back(false); } + // Bias + { + int32_t numBytesSmemBias = 0; + if (isBiasTypeN(biasType)) + { + numBytesSmemBias = tileN * sizeof(float); + } + else if (isBiasTypeM(biasType)) + { + numBytesSmemBias = tileM * sizeof(float); + } + else if (isBiasTypeMn(biasType)) + { + numBytesSmemBias = tileM * tileN * sizeof(float); + } + // Number of bytes alignment for bias + auto const numBytesAlignmentBias = 16; + // Add info. + smemChunkNames.emplace_back("smemBias"); + numBytesAndAlignmentPerSmemChunk.emplace_back(std::make_pair(numBytesSmemBias, numBytesAlignmentBias)); + firstChunkReuseSmem.emplace_back(false); + } + // Per-block absolute maximum for multi-warp reduction. { // Number of bytes: number of epilogue warps * number of tile columns. @@ -327,6 +408,25 @@ public: firstChunkReuseSmem.emplace_back(false); } + // SmemConstSfBuf + // A buffer used to copy constant values to TMEM. + { + // Do we need the buffer? + bool const useConstSfBuf = dtypeB == tg::Dtype::E4m3 && dtypeMmaB == tg::Dtype::MxE4m3; + // Number of bytes for the buffer. + auto const numSmemBytesConstSfBuf = useConstSfBuf ? 512 : 0; + // Number of bytes for the alignment of the buffer. + auto const numBytesAlignmentConstSfBuf = 16; + // No need to reuse the first chunk. + auto const reuseChunksSmemConstSfBuf = false; + + // Add info. + smemChunkNames.emplace_back("smemConstSfBuf"); + numBytesAndAlignmentPerSmemChunk.emplace_back( + std::make_pair(numSmemBytesConstSfBuf, numBytesAlignmentConstSfBuf)); + firstChunkReuseSmem.emplace_back(reuseChunksSmemConstSfBuf); + } + // Create SMEM helper object. mSmemAllocatorHelper = MemAllocatorHelper(numBytesAndAlignmentPerSmemChunk, firstChunkReuseSmem, smemChunkNames); @@ -370,10 +470,12 @@ public: // Matrix A { + // We use TMEM for A if we use slice-K or if we need to cast A. 
+ bool const useTmemA = (numSlicesForSliceK > 1) || (dtypeMmaA != dtypeA); // Number of columns for A. - auto const numTmemColsA = numSlicesForSliceK > 1 ? numStages * tileK - / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeElt)) - : 0; + auto const numTmemColsA = useTmemA ? numStages * tileK + / (numSlicesForSliceK * tg::dtypeGetNumBits(tg::Dtype::UInt32) / tg::dtypeGetNumBits(dtypeMmaA)) + : 0; // Number of columns for A alignment. auto const numColsAlignmentA = 4; // No need to reuse TMEM. @@ -385,15 +487,18 @@ public: firstChunkReuseTmem.emplace_back(reuseChunksTmemA); } - bool const useBlockScaling = tg::dtypeIsBlockFmt(dtypeElt); - // Sf A { + // Does the MMA require block scales in TMEM for A? + bool const useBlockScalingA = tg::dtypeIsBlockFmt(dtypeMmaA); + // Are the block scales constant? + bool const useConstSfA = useBlockScalingA && !tg::dtypeIsBlockFmt(dtypeA); // Number of columns for scaling factors of A. - auto const numTmemColsSfA - = useBlockScaling ? ((tileK / 64) * 2 * tg::ceilDiv(tileM, 64)) * numStages : 0; + auto const numTmemColsSfA = useConstSfA + ? tg::roundUp((tileK / 64) * 2 * tg::ceilDiv(tileM, 64), 4) + : (useBlockScalingA ? ((tileK / 64) * 2 * tg::ceilDiv(tileM, 64)) * numStages : 0); // Number of columns for Sf alignment. - auto const numColsAlignmentSfA = 2; + auto const numColsAlignmentSfA = 4; // No need to reuse TMEM. auto const reuseChunksTmemSfA = false; @@ -405,11 +510,16 @@ public: // Sf B { + // Does the MMA require block scales in TMEM for B? + bool const useBlockScalingB = tg::dtypeIsBlockFmt(dtypeMmaB); + // Are the block scales constant? + bool const useConstSfB = useBlockScalingB && !tg::dtypeIsBlockFmt(dtypeB); // Number of columns for scaling factors of B. - auto const numTmemColsSfB - = useBlockScaling ? ((tileK / 64) * 2 * tg::ceilDiv(tileN, 64)) * numStages : 0; + auto const numTmemColsSfB = useConstSfB + ? 
tg::roundUp((tileK / 64) * 2 * tg::ceilDiv(tileN, 64), 4) + : (useBlockScalingB ? ((tileK / 64) * 2 * tg::ceilDiv(tileN, 64)) * numStages : 0); // Number of columns for Sf alignment. - auto const numColsAlignmentSfB = 2; + auto const numColsAlignmentSfB = 4; // No need to reuse TMEM. auto const reuseChunksTmemSfB = false; @@ -426,6 +536,8 @@ public: } public: + // The MMA kind. + tg::MmaKind mMmaKind; // Helper for SMEM allocation. MemAllocatorHelper mSmemAllocatorHelper; // Helper for TMEM allocation. @@ -454,14 +566,14 @@ inline int32_t getTmemBufferSize(KernelTraits traits) inline int32_t getSmemOffsetLoadA(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(0); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadA"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetLoadB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(1); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemLoadB"); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -475,50 +587,63 @@ inline int32_t getSmemOffsetLoadAb(KernelTraits traits) inline int32_t getSmemOffsetLoadShuffleB(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(2); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBShuffle"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetGmemC(KernelTraits traits, int resIdx = 0) { - return traits.mSmemAllocatorHelper.getChunkOffset(3 + resIdx); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemGmemC" + std::to_string(resIdx)); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetRowMax(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(5); + return 
traits.mSmemAllocatorHelper.getChunkOffsetByName("smemRowMax"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetSliceK(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(6); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemSliceK"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetPerTokenSf(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(7); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemPerTokenSf"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline int32_t getSmemOffsetBias(KernelTraits traits) +{ + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBias"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getSmemOffsetBlockAmax(KernelTraits traits) { - return traits.mSmemAllocatorHelper.getChunkOffset(8); + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemBlockAmax"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline int32_t getSmemOffsetConstSfBuf(KernelTraits traits) +{ + return traits.mSmemAllocatorHelper.getChunkOffsetByName("smemConstSfBuf"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0) { - // Be conscious that the index (3 + resIdx) should match the index in getSmemOffsetGmemC(). 
- return traits.mSmemAllocatorHelper.getFirstChunkReuseFlag(3 + resIdx); + return traits.mSmemAllocatorHelper.getFirstChunkReuseFlagByName("smemGmemC" + std::to_string(resIdx)); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -529,30 +654,32 @@ inline int32_t isSmemAbRepurposedToGmemC(KernelTraits traits, int resIdx = 0) inline int32_t getTmemOffsetD(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(0); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemD"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetA(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(1); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemA"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetSfA(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(2); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfA"); } //////////////////////////////////////////////////////////////////////////////////////////////////// inline int32_t getTmemOffsetSfB(KernelTraits traits) { - return traits.mTmemAllocatorHelper.getChunkOffset(3); + return traits.mTmemAllocatorHelper.getChunkOffsetByName("tmemSfB"); } //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace gemm + +} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/TmaDescriptor.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/TmaDescriptor.h index 8d26c4b972..159169e4a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/TmaDescriptor.h +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/TmaDescriptor.h @@ -17,6 +17,7 @@ #pragma once #include "trtllm/gen/DtypeDecl.h" +#include "trtllm/gen/MmaDecl.h" #include #ifdef TLLM_ENABLE_CUDA @@ -25,6 +26,9 @@ #include #endif +namespace gemmGatedAct +{ + namespace gemm { @@ -36,13 +40,15 @@ namespace tg = trtllm::gen; #ifdef TLLM_ENABLE_CUDA -inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector const& shapes, - std::vector const& strides, int32_t tileSizeMn, int32_t tileSizeK, void* gmemAddr, bool doSwizzle = true) +inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, std::vector const& shapes, + std::vector const& strides, std::vector const& tileShapes, void* gmemAddr, bool doSwizzle = true) { + // The multiplication factor of the data padding in SMEM. + int32_t padMultiplier = 1; CUtensorMap desc{}; // The data type. CUtensorMapDataType tmaDataFormat{CU_TENSOR_MAP_DATA_TYPE_FLOAT32}; - if (dtype == tg::Dtype::E4m3 || dtype == tg::Dtype::MxE4m3) + if (dtype == tg::Dtype::E4m3 || dtype == tg::Dtype::MxE4m3 || dtype == tg::Dtype::UE8m0) { tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_UINT8; } @@ -58,36 +64,56 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector c { tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B; } + else if (dtype == tg::Dtype::MxE2m1) + { + if (mmaKind == tg::MmaKind::MxFp8Fp6Fp4) + { + padMultiplier = 2; + tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B; + } + else + { + // Note: this is used with the MMA kind MxFp4NvFp4 and also when casting to a higher-precision + // type such as Bfloat16 before the MMA. 
+ tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B; + } + } else if (dtype == tg::Dtype::Fp32) { tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_FLOAT32; } else { - std::cerr << "buildNdTmaDescriptor: unexpected dtype " << static_cast(dtype) << std::endl; + std::cerr << "buildNdTmaDescriptor: unexpected dtype " << tg::dtypeToString(dtype) << std::endl; assert(false); } // The swizzle type. CUtensorMapSwizzle swizzleType{CU_TENSOR_MAP_SWIZZLE_NONE}; - int32_t tileKSizeInBytes = (tileSizeK * tg::dtypeGetNumBits(dtype)) / /* bits */ 8; + int32_t fastestDimTileSizeBytes = (tileShapes[0] * tg::dtypeGetNumBits(dtype) * padMultiplier) / /* bits */ 8; if (doSwizzle) { - if ((tileKSizeInBytes % 128) == 0) + if ((fastestDimTileSizeBytes % 128) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_128B; } - else if ((tileKSizeInBytes % 64) == 0) + else if ((fastestDimTileSizeBytes % 64) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_64B; } - else if ((tileKSizeInBytes % 32) == 0) + else if ((fastestDimTileSizeBytes % 32) == 0) { swizzleType = CU_TENSOR_MAP_SWIZZLE_32B; + // This path is only for the scaling factors. + } + else if ((fastestDimTileSizeBytes % 16) == 0 && (dtype == tg::Dtype::UE8m0 || dtype == tg::Dtype::E4m3)) + { + swizzleType = CU_TENSOR_MAP_SWIZZLE_NONE; } else { - std::cerr << "buildNdTmaDescriptor: unexpected tileKSizeInBytes " << tileKSizeInBytes << std::endl; + std::cerr << "buildNdTmaDescriptor: unexpected fastestDimTileSizeBytes " << fastestDimTileSizeBytes + << std::endl; assert(false); } } @@ -97,8 +123,9 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector c // Check shape must be in range [1, 2^32] int32_t dim = shapes.size(); - // Expect 2 dimensions. - assert(dim == 2 || dim == 3); + // Expect 2 dimensions for regular gemm, 3 dimensions for batched gemm or blocked layout, and 4 + // dimensions for batched gemm with blocked layout. + assert(dim == 2 || dim == 3 || dim == 4); // Check shape range. 
for (int32_t ii = 0; ii < dim; ++ii) { @@ -119,63 +146,78 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector c } // Set the number of elements in the packed uint32_t element. - auto const numEltsPerUInt32 = 4 * /* bits */ 8 / tg::dtypeGetNumBits(dtype); + auto const numEltsPerUInt32 = 4 * /* bits */ 8 / (tg::dtypeGetNumBits(dtype) * padMultiplier); // The number of elements in 128B. auto const numEltsIn128B = numEltsPerUInt32 /*4B*/ * 32; // The number of tile K hidden size (per token) in each block of shared memory. - auto const numEltsInClampedTileKSize = std::min(numEltsIn128B, tileSizeK); + auto const numEltsInClampedFastestTileSize = std::min(numEltsIn128B, tileShapes[0]); - // Build tile shapes. - std::vector tileShapes(dim, 1); - tileShapes[0] = numEltsInClampedTileKSize; // tileSizeK - tileShapes[1] = tileSizeMn; // tileSizeMn + // Build box dim array. If tileShapes is smaller than dim, just fill with 1s. + assert(static_cast(tileShapes.size()) <= dim); + std::vector boxDim(dim, 1); + boxDim[0] = numEltsInClampedFastestTileSize; + for (size_t ii = 1; ii < tileShapes.size(); ++ii) + { + if (tileShapes[ii] > 256) + { + std::cerr << "buildNdTmaDescriptor: boxDim too large " << tileShapes[ii] << std::endl; + assert(false); + } + else + { + boxDim[ii] = tileShapes[ii]; + } + } // Set tile strides to 1; std::vector tileStrides(dim, 1); // Build the descriptor. 
CUresult result = cuTensorMapEncodeTiled(&desc, tmaDataFormat, - /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), tileShapes.data(), tileStrides.data(), + /*tensorRank=*/dim, gmemAddr, shapes.data(), stridesInBytes.data(), boxDim.data(), tileStrides.data(), /*interleave=*/CU_TENSOR_MAP_INTERLEAVE_NONE, swizzleType, /*l2Promotion=*/CU_TENSOR_MAP_L2_PROMOTION_L2_128B, /*oobFill=*/CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); if (result != CUDA_SUCCESS) { - std::cerr << "Error: Failed to initialize the TMA descriptor " << result << std::endl; + char const* errorString; + cuGetErrorString(result, &errorString); + std::stringstream ss; + ss << "Error: Failed to initialize the TMA descriptor " << result << std::endl; - std::cerr << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr - << std::endl; + ss << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; - std::cerr << "Shape: "; + ss << "Shape: "; for (int ii = 0; ii < dim; ++ii) { - std::cerr << shapes[ii] << " "; + ss << shapes[ii] << " "; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "Stride: "; + ss << "Stride: "; for (int ii = 0; ii < dim - 1; ++ii) { - std::cerr << stridesInBytes[ii] << " "; + ss << stridesInBytes[ii] << " "; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileShapes: "; + ss << "tileShapes: "; for (int ii = 0; ii < dim; ++ii) { - std::cerr << tileShapes[ii] << " "; + ss << boxDim[ii] << " "; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileStrides: "; + ss << "tileStrides: "; for (int ii = 0; ii < dim; ++ii) { - std::cerr << tileStrides[ii] << " "; + ss << tileStrides[ii] << " "; } - std::cerr << std::endl; - std::cerr << "swizzleType: " << int(swizzleType) << std::endl; - assert(false); + ss << std::endl; + ss << "swizzleType: " << int(swizzleType) << std::endl; + ss << "(in " << __FILE__ << ":" << __LINE__ << ")" << std::endl; + throw 
std::runtime_error(ss.str()); } return desc; @@ -193,7 +235,7 @@ inline CUtensorMap buildSfTmaDescriptor(tg::Dtype dtype, std::vector c } else { - std::cerr << "buildSfTmaDescriptor: unexpected dtype " << static_cast(dtype) << std::endl; + std::cerr << "buildSfTmaDescriptor: unexpected dtype " << tg::dtypeToString(dtype) << std::endl; assert(false); } @@ -243,41 +285,44 @@ inline CUtensorMap buildSfTmaDescriptor(tg::Dtype dtype, std::vector c if (result != CUDA_SUCCESS) { - std::cerr << "Error: Failed to initialize the TMA descriptor for SF " << result << std::endl; + char const* errorString; + cuGetErrorString(result, &errorString); + std::stringstream ss; + ss << "Error: Failed to initialize the TMA descriptor for SF " << errorString << std::endl; - std::cerr << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr - << std::endl; + ss << "tmaFormat: " << static_cast(tmaDataFormat) << " dim: " << dim << " gmem: " << gmemAddr << std::endl; - std::cerr << "shape:"; + ss << "shape:"; for (uint32_t shape_i : shapes) { - std::cerr << " " << shape_i; + ss << " " << shape_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "stridesInBytes:"; + ss << "stridesInBytes:"; for (uint32_t stride_i : stridesInBytes) { - std::cerr << " " << stride_i; + ss << " " << stride_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileShapes:"; + ss << "tileShapes:"; for (uint32_t tileShape_i : tileShapes) { - std::cerr << " " << tileShape_i; + ss << " " << tileShape_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "tileStrides:"; + ss << "tileStrides:"; for (uint32_t tileStride_i : tileStrides) { - std::cerr << " " << tileStride_i; + ss << " " << tileStride_i; } - std::cerr << std::endl; + ss << std::endl; - std::cerr << "swizzleType: " << int(swizzleType) << std::endl; - assert(false); + ss << "swizzleType: " << int(swizzleType) << std::endl; + ss << "(in " << __FILE__ << ":" << __LINE__ << ")" << std::endl; 
+ throw std::runtime_error(ss.str()); } return desc; @@ -288,3 +333,5 @@ inline CUtensorMap buildSfTmaDescriptor(tg::Dtype dtype, std::vector c //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace gemm + +} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..03bd577f3b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64825738ffb177c9d050477748f8b143a5658abcea5d24300ebe10c9d0c23812 +size 506744 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..a20a7a571d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00331fdded9eecdf1b2740cb442656273238dbcf321f77df619c6f3c559a1d16 +size 529688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..9544808560 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f31f32f4cdccef3ad6d345ac6f9b590674d42747b0658cf8caf083a36aba1c4 +size 460762 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..c2a2a84bfd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b25d48807b6fcabf1ae0cbcc8809024fd09ec97f60db67033d2067cbcca24b2 +size 481536 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..15230e4c8b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:793d3fd1df23be23ae761a8e1dac7ec97e5d6f6a651bb1a888f77a71b503284d +size 469938 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..d2cf3e8899 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b775ce5d102ed4d49fd33c961850e528df13f1d6cb299f5dfc64266af62895f9 +size 489872 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..ab4033b623 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:487e827f2c18721fff15c08fd22e8fd7e72794fd4dfab41a4eae41dadf468c4c +size 483208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..10d9d48af9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a8fc5be199f1791fb864f464625575f4761fc2de2f845ef04726458231607aa +size 503834 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..540784efc1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0b15f2ad9aa2f293f300494287a6666325b63077c9e40c40157fe2d0cb214e9 +size 454934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..bc3ec7e474 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e75ba9492900711a9ae00c4e7387e23f494d6dae0c0ff21c89f14caf3d9e56f3 +size 497278 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..e18046141e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7336eadf575381b813459cce77160bd4d23d2f3b85cb7f616a22a02067e1ec82 +size 475658 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..c0404246cf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6869c1ac7aac3c41b3a2328467c863d87d8243bb3207554f8573974903851589 +size 518052 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..13ae48d43c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10757cb724a2b2233459df54fc6a17dbbab191a887f408b1c9b95e6688f2a2e +size 440332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..11658595c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c838dff73be1bb253d0e3fc6ff2b723bbc72b00aa479b161817b597b7ea68f4 +size 483810 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..4a277d8870 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ddf0104985fac9aaedf88f216a505424581226fc24dbc84a743a2e732e60b37 +size 465842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 5a1179b298..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9434907544b528b0f46ab2472402b64bf13a3405745d2c1ce02b2f962bc7c774 -size 297623 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 4df2e220d6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b00b1627dea02258b7b5d895de301efd75c742ac468a220865aea997a704fb4c -size 331481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..21cb21b0b7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:748dd243f2c036fe286035809d88894f9ed7bf3b99dbcc9c26fd17189dc531cd +size 322140 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..6fd2dfcae7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b3b8ddfc5a7faf55066548884109c8f8d1444e7da205b75c94870fce477d868 +size 333540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..757a4b7499 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b68b111fc706853d15b6c516ba254d637ea100882d7d67ac795e2da3fd500937 +size 268856 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..46e7159b5b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28e4ccb1e8106112bf6cf59ce5e3db374ea1f005aeb9639a880d93f050efd4d4 +size 283462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..190871b5ed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d530428dccc7b8c4e1ca35c5f73b12291a5039a31c61a5446957f8e72eda0c07 +size 278820 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..4552e17821 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05556fc1ee16d50a9647b408b8930c9dfa39f07ea183ff4141b6bc406fc5d401 +size 293378 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..2dd1d4dfed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:233ca1067b7c1fffe6cae9ba1061cbae8d3f96e84ab95030e51bf8e4ef6aa4d2 +size 294164 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..188a28ddad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca227225f8536a9bb556aca35c17c17cd94268272cd0010abc352c47b5b3694a +size 308720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..71483a9d22 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6855d05d06a6e70e14b66ce6c81a67a6d46033892194ae75278e45c304bda94 +size 266530 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..3c4de297d5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8009c78c59a19e9d6c490e5a2f2e0bafbdb91dcb3ad1b2ed3715e0394129a22 +size 303990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..cc0619dc45 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7bd392b67d3ae4f35a877901236bdeabfd8ee220883a52f685f1e524f166006 +size 281088 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..df073ad7d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbcfce3357caf135eeddd27dac48dd357bad6ca7ea529a398c55e725e06feb63 +size 318598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp deleted file mode 100644 index e8d20c2364..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:418acca7365bcedd4fa73d756169184484666ffb5bfee08a281946e6c4d6d94b -size 297745 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index e613213678..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:98ee8d43aa5f90e048eb334711b861de202f769457cc4db1703ef71d6c75ccf1 -size 222291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index abf57d4c8f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Bfloat16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e85b4cb09dda8f57b068b53030f19cdd0552407381bbc5bd50e32490e18c5e16 -size 254473 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 409920f595..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a98affa59323662168205c915685f87a95c2408fad1c406056f226ec8739333b -size 305853 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index ee2186243f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E2m1_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:460cf57675a1137a9cf0d851413404e2bf2ace1a52a6b427235f94ba1d580efa -size 342425 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..8f1760adae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8045803f4f0ebb528ac2e29531b9dc26bc21afd061a0b17d33a602d3b1ad0544 +size 508316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..d4e0635bf8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:954a450602ec5289fb887c5c9338a2e966ff1dc0247d8937a2383cdd37ed2b3c +size 530470 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..f4835e6326 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:733495c6cc399bc9e1a42f1884cd310feb1f8032c0e80bcc485cc82923a88c98 +size 460310 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..54c30eede8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf405381e739345725068755c62be0399d11307ea323ecc8d080e6c83f099b23 +size 481034 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..3f40ddc1d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18b2f946e4fc0286396225af6fd19c93eb1a5fbf25ec74e4a607d216b3b3cc6a +size 468944 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..34fc5f6675 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7796e9842511732e78c89624859678ec8cbdf7b22fd9cff6d841f2afde76806 +size 489668 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..dc3a950bf7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd004a4e6b4840f77d3202ea7ff03235dacda2c5c55bde6c69057ea09240a8a5 +size 484384 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..ffc9995118 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f4147ab63255303c2c8b501d8199ad1bb496d0e3f4ea11052018a5f239d8784 +size 505010 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..7fe1b43496 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ab364f318eb520a9af8e5e198101136d3eb3dafee5f11ceb183a9eb5f3687f9 +size 453298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..c0bdd4150f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3242fdbb772c0b42521369a28da3a5a06657755bc262e3f079c0c0363000959 +size 496432 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..7fcb732bf3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93be1f8aea377ef7a6ee13c00f515a8cd15e056574722bd2111112e02fd79f5b +size 474022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..e5fd58f64d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31fa226ea455ab5b5c066168f900af1c1136d2317225bb8ce6f168c2c3a68e02 +size 517206 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..6ea9a32b82 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3d19ffa5022ccd1278b055a2279ae7b1bf0811c3b624a1d833c79ff7f3a3414 +size 438696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..b36799e7ee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e39f56ccd9fc594fd3dde2548cbb52896c2a2a257ba333cf34c76a212a2852dc +size 482964 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..c756442b90 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:983cd7ea8f3a8c6e75934a6b4d45944c4063d960cdb8b1ff63b22a70b90d4b42 +size 464206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..3c8f1910ac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3148297b5209ed22b3374512e97ac300303ea855fb305fc8ea224fbcb4396739 +size 323710 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..dd4ef6fef9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69ab2d5cdb244aee7634a18afcd11f4ac368f3c3baa01c14e8d9943036092ede +size 335160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..2ce3228f85 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d9dbf86274414ba21e427f955701be108436ef71c130259a7422fe6b4675490 +size 269144 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..4250c79f3b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62b5e4db1a0893aed5f4d3ec4cb7e21407295a626bf94096cec300d7f871c468 +size 283750 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..e8623a6863 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d5d4215d6df0c40f641de7018137f891f3bea915345a83bb21e98d016f6096a +size 279404 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..24402c98ae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f47f97301ecb0ac5a21a255c5e3810d6a4b682e48ea85d36c14d45a4e5b35a8d +size 294012 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..48b78357a9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4982a83c9932f03e9906a1897e9d186500ddc3cf34072944cac537188a7b39e +size 295340 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..859f749c76 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098e63c846db6214e364fe3b53a58b4843042904bcb3bbbfb6e63591ad361d4e +size 309896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..c0e4007081 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c7d5a65314dd35640b60d67317004610d52e06f79f9f09bc6bc8e41c724a23c +size 264894 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..011bd9267f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f932750c588de648d9e2f5517b87191ac31a24604c227d5c84f1b86d552105f7 +size 302404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..6631691d96 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31e424393c8140b5b90fd413e6e7b98e33ee5844620d97f8bb8469984634c088 +size 280242 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..fd40969467 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:512eb09ba371d2667eb749921c8e6a4e42da0d3448bdc5b3f30b0b16eb70fbda +size 317750 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp deleted file mode 100644 index 72f329db05..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1dbdc8e54e2187ffb345d5a797c9a7039066fb746f253220121095af454924dd -size 300203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index bd8f1bf51e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ff17b22e983563501c3339ee7160475e2086cada9e957afb9a1ff89e901b8446 -size 221445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index a94e66978c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_E4m3_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ca2dbad0de7cd53b39225d96ceaa071e1b27ce3feff861691b7ada7bde9e9411 -size 254415 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index cea5c48ce0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8b99d98321f44aea8334cfa4dea7317fc0e7dd2eec9b972106bfae5e1bf740bf -size 296825 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 304dcd84b4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ef2353b8bcd016f7f35df660d958cb1b310a944915f655d6984441b264b6855 -size 330685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp deleted file 
mode 100644 index 21625a8ff0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x128x256_epilogueTile128x128_mma128x128x32_cluster1x1x1_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3841f0321dbfe7785af9b2c10f4178b664bf6b8166b02ac670ba379868a292b8 -size 296947 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index e110963d8a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4915aeb2ad55d39bc8dc15d9d528710452271ac3959b44c93fdee4fe625dfdb8 -size 221493 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 0bc89b9de3..0000000000 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp16_E4m3_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x32_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9416865d0b59145c1ed6db76342e013dfa21137679aac4b56a44cf862fec0f62 -size 254465 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..cc3d8308b1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:740dafaf3387aebc26cf0f84f442e249a07e2e19eb491878b3b2e493385c917a +size 510880 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..b0bbbefafd --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x64_m128x128x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5126cb0e627e4aa8429f749e6dbaf72a0e037dcb19155495a5118a12c6999ccf +size 533036 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..8b6f631f82 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f41bcdc0ecf4a7c5f17f27dcbce34adbe2dacf5a21ff60d3ff3f0facc9c2b934 +size 460112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..3341bb1633 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ddc81099541541cc6bffc424eeeacaac65dc546e452807411185563ddac54c0 +size 480836 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..73d1aff821 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1eaa189e0806e7bf7b7eab87f41671e5ba025b24145787b92de902b5d82b1de8 +size 469240 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..5cce64f57b --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fbfb3dd74af8c7df9f881a27b84d0a4749c5acdfe2e6926276f88c41f9a4072 +size 490012 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..f954a81914 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50792340aaeac5ddce249a1d71340ffbb64ad316853711f68259b1381bfd08dd +size 484878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..dff1971d5d --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x64x256u2_s5_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f29d835200bfdba36127454dce36c790fd60aacdc75e2769759d8f80ffc9f374 +size 505552 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..73001460e3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dbfe2d07d4ad0f6bf9d67f61e44f3969d95678f2e11d90927ae1b6f1fdaa091 +size 454236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..d21de0a685 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:214b46b7a53b611e09ddb1ae46b587b4d51d15cb41c82c755078b1c9f47ee5ef +size 497370 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..f32f3fc2b1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d24dc12e89606b4a16e678e0eda061ee67efa30d51c204517149f042f55576af +size 474960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..98b5db44a1 --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d89772090a327c1e6eb5112e614d5aeee460a9e6f4537df6213dee4eb46f15a8 +size 518142 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..b75e35c627 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a270b335a08608ac16b7a902e0774c79da2a421b6c978edb7a2b0e9004b00470 +size 439634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..ffa2ef1f0c --- /dev/null +++ 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f00c59326d6305e5ded33913026b1b11d1d8ca1bb0173edc59071f0cede07de4 +size 483902 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..a6ebd4a64d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8f777d86c3c8f7d5063b0f2b64e07a5f96bef1c5179b5b0edcf1082a6ec7b36 +size 465142 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index c9201422eb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x1_transposeMmaOutput_sm100a_cubin.cpp 
+++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8570922d77439b30ef9f1bd0f1de2e48cd0688c52d4c5d7b48a8ab9df80678bb -size 297023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp deleted file mode 100644 index 108db9b8d1..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E2m1_Fp32_tile128x8x256_epilogueTile128x8_mma128x8x64_cluster1x1x4_splitK4_transposeMmaOutput_sm100a_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6d9b1bac0bdb963a4b5ae0d2c63c77495dcab658079b742b59ec9c9965c9fbea -size 331573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..52a4b8a9b0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:408f5703a7343818daa01875126d6fe7c1c376e811ab5a212353c30e99a3e9c1 +size 324696 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..d6cebfafc2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x128x128u2_s3_et128x64_m128x128x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:167411f3fcb8e9469c2cc484d77b5909868a6fc0c7200f0c4afa2b4c5a4de8be +size 337724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..86a667a70a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab3400f3894e7b4f8d62d32fa7d8ba376b6fc52f159258752011792fcf34f225 +size 269784 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..cf3633e571 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9b7dff6dc295ad05b14742fc2b910718ac3305631d4e5deda5f1d9e1771055b +size 284342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..d882f1015b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa4dafad95d4bb122a1692e01c902510c1ba3b975d72d9866bd7fdcb633d29de +size 279700 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..dad6e6f612 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9337b4267d7d924dc8a326297227719ca1597523f8ae58767fbcd3197aee26d5 +size 294308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..9e1f4c65b7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:115ea20ffa04cf8680d7ec0c77620bb9fe6c5dc38558f13f45423e87f7503bd4 +size 295044 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..fe3bcd032a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4a81b57b3c44c1e15e521f907c14b8ce2a95b07386224687f011dad52046d17 +size 309650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..3fba867b55 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0c68394e249ada5d065ecfcb4737a750e82d02fe3573c50054d5b2102c3c377 +size 265832 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..7f61931b67 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f00d349cbdde3bea50fb98fd3bec3e7788379f7c0a47ecd5efe21139738c016 +size 303342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..0dbde721c2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a884dcd66ba39d8e3ce8d424f572d2e48fefe5d40a87ca89fecead78a0ee86f +size 280390 diff --git 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp new file mode 100644 index 0000000000..61ec3dcdd3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/cubins/GemmGatedActKernel_Fp32_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x4_16dp256b_splitK4_TN_transOut_schedS_swiGlu_sm100a_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62c13b2577da5bcc7f26c47e842f915d833e7d785343c9a2673828168a223f4f +size 318688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CommonUtils.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CommonUtils.h index 0efa93faf5..680f22271a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CommonUtils.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CommonUtils.h @@ -16,6 +16,9 @@ */ #pragma once +namespace gemmGatedAct +{ + namespace trtllm { namespace gen @@ -41,3 +44,5 @@ inline T roundUp(T m, T n) } // namespace gen } // namespace trtllm + +} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CudaKernelLauncher.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CudaKernelLauncher.h index 5d31c37411..60e07c6fce 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CudaKernelLauncher.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/CudaKernelLauncher.h @@ -22,6 +22,9 @@ #include #include #endif +namespace gemmGatedAct +{ + namespace trtllm { namespace gen @@ -89,3 +92,5 @@ inline CUresult launchKernel(void* kernelParams, void* cudaStream, int32_t smemS } // namespace gen } // namespace trtllm + +} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/DtypeDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/DtypeDecl.h index a6892f12ca..ce0670f9e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/DtypeDecl.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/DtypeDecl.h @@ -20,6 +20,11 @@ #include #include #include +#ifndef TLLM_GEN_EXPORT_INTERFACE +#include "trtllm/gen/MmaDecl.h" +#else +#include "MmaDecl.h" +#endif //////////////////////////////////////////////////////////////////////////////////////////////////// // @@ -28,6 +33,9 @@ // //////////////////////////////////////////////////////////////////////////////////////////////////// +namespace gemmGatedAct +{ + namespace trtllm { namespace gen @@ -55,26 +63,25 @@ enum class Dtype : uint32_t // clang-format off Bfloat16 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 16u, /*uid*/ 0u), Bool = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 1u, /*uid*/ 1u), - PadType = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 1u, /*bits*/ 8u, /*uid*/ 2u), - E2m1 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 4u, /*uid*/ 3u), - E2m3 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 6u, /*uid*/ 4u), - E3m2 = 
TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 6u, /*uid*/ 5u), - E4m3 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 6u), - E5m2 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 7u), - Fp16 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 16u, /*uid*/ 8u), - Fp32 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 32u, /*uid*/ 9u), - Int8 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 1u, /*bits*/ 8u, /*uid*/ 10u), - Int32 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 1u, /*bits*/ 32u, /*uid*/ 11u), - Int64 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 1u, /*bits*/ 64u, /*uid*/ 12u), - MxE2m1 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 4u, /*uid*/ 13u), - MxE4m3 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 14u), - UE8m0 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 15u), - UInt8 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 8u, /*uid*/ 16u), - UInt16 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 16u, /*uid*/ 17u), - UInt32 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 32u, /*uid*/ 18u), - UInt64 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 64u, /*uid*/ 19u), - UInt128 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 128u, /*uid*/ 20u), - Void = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 0u, /*uid*/ 21u), + E2m1 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 4u, /*uid*/ 2u), + E2m3 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 6u, /*uid*/ 3u), + E3m2 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 6u, /*uid*/ 4u), + E4m3 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 
5u), + E5m2 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 6u), + Fp16 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 16u, /*uid*/ 7u), + Fp32 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 32u, /*uid*/ 8u), + Int8 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 1u, /*bits*/ 8u, /*uid*/ 9u), + Int32 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 1u, /*bits*/ 32u, /*uid*/ 10u), + Int64 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 1u, /*bits*/ 64u, /*uid*/ 11u), + MxE2m1 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 4u, /*uid*/ 12u), + MxE4m3 = TLLM_ENCODE_DTYPE(/*block*/ 1u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 13u), + UE8m0 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 0u, /*bits*/ 8u, /*uid*/ 14u), + UInt8 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 8u, /*uid*/ 15u), + UInt16 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 16u, /*uid*/ 16u), + UInt32 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 32u, /*uid*/ 17u), + UInt64 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 64u, /*uid*/ 18u), + UInt128 = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 0u, /*int*/ 1u, /*bits*/ 128u, /*uid*/ 19u), + Void = TLLM_ENCODE_DTYPE(/*block*/ 0u, /*signed*/ 1u, /*int*/ 0u, /*bits*/ 0u, /*uid*/ 20u), // clang-format on #undef TLLM_ENCODE_DTYPE @@ -153,6 +160,7 @@ inline std::string dtypeToString(Dtype dtype) case Dtype::Int32: return "Int32"; case Dtype::Int64: return "Int64"; case Dtype::MxE4m3: return "MxE4m3"; + case Dtype::MxE2m1: return "MxE2m1"; case Dtype::UE8m0: return "UE8m0"; case Dtype::UInt8: return "UInt8"; case Dtype::UInt16: return "UInt16"; @@ -205,5 +213,50 @@ inline Dtype dtypeGetBlockSfType(Dtype dtype) //////////////////////////////////////////////////////////////////////////////////////////////////// +inline MmaKind 
dtypeGetMmaKind(Dtype dtypeA, Dtype dtypeB) +{ + auto dtypeEltA = dtypeEltType(dtypeA); + auto dtypeEltB = dtypeEltType(dtypeB); + + // Note: the order of the conditions is important here. + if ((dtypeA == Dtype::Fp16 && dtypeB == Dtype::Fp16) || (dtypeA == Dtype::Bfloat16 && dtypeB == Dtype::Bfloat16)) + { + return MmaKind::Fp16; + } + + if ((dtypeA == Dtype::Int8 || dtypeA == Dtype::UInt8) && (dtypeB == Dtype::Int8 || dtypeB == Dtype::UInt8)) + { + return MmaKind::Int8; + } + + // This statement captures both MxE2m1 and E2m1. + if (dtypeEltA == Dtype::E2m1 && dtypeEltB == Dtype::E2m1) + { + return MmaKind::MxFp4NvFp4; + } + + if ((dtypeA == Dtype::E4m3 || dtypeA == Dtype::E5m2 || dtypeA == Dtype::E2m3 || dtypeA == Dtype::E3m2 + || dtypeA == Dtype::E2m1) + && (dtypeB == Dtype::E4m3 || dtypeB == Dtype::E5m2 || dtypeB == Dtype::E2m3 || dtypeB == Dtype::E3m2 + || dtypeB == Dtype::E2m1)) + { + return MmaKind::Fp8Fp6Fp4; + } + + // At this point we know that both dtypes are Mx types and not both MxE2m1 at the same time. 
+ if ((dtypeEltA == Dtype::E4m3 || dtypeEltA == Dtype::E5m2 || dtypeEltA == Dtype::E2m3 || dtypeEltA == Dtype::E3m2 + || dtypeEltA == Dtype::E2m1) + && (dtypeEltB == Dtype::E4m3 || dtypeEltB == Dtype::E5m2 || dtypeEltB == Dtype::E2m3 || dtypeEltB == Dtype::E3m2 + || dtypeEltB == Dtype::E2m1)) + { + return MmaKind::MxFp8Fp6Fp4; + } + return MmaKind::Tf32; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace gen } // namespace trtllm + +} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/MmaDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/MmaDecl.h new file mode 100644 index 0000000000..f3822f89fa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/MmaDecl.h @@ -0,0 +1,90 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & + * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +namespace gemmGatedAct +{ + +namespace trtllm +{ +namespace gen +{ + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// The kind of the MMA instruction +enum class MmaKind : uint32_t +{ + // For Blackwell this follows the PTX ISA description of the MMA instructions. + // https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-kind-shapes + + // The MMA type is auto-detected from the dtypes of the input tensors + Auto = 0, + // Supports dtypeA = dtypeB = Fp16 and dtypeD = [Fp16, Fp32] + // or dtypeA = dtypeB = Bfloat16 and dtypeD = [Fp32] + // Corresponds to the kind::f16 of tcgen05.mma. + Fp16 = 1, + // Supports dtypeA/B = [E4m3, E5m2, E2m3, E3m2, E2m1] and dtypeD = [Fp16, Fp32] + // Corresponds to the kind::f8f6f4 of tcgen05.mma. + Fp8Fp6Fp4 = 2, + // Supports dtypeA = dtypeB = [Int8, Uint8] and dtypeD = [Int32] + // Corresponds to the kind::i8 of tcgen05.mma. + Int8 = 3, + // Supports dtypeA = dtypeB = [MxE2m1, E2m1] with block scale [UM8e0, UEm4e3] + // and dtypeD = [Fp32] + // Corresponds to the kind::mxf4nvf4 of tcgen05.mma. + MxFp4NvFp4 = 4, + // Supports dtype dtypeA = dtypeB = [MxE4m3, MxE2m1] with block scale [UM8e0] + // and dtypeD = [Fp32] + // Corresponds to the kind::mxf8f6f4 of tcgen05.mma. + MxFp8Fp6Fp4 = 5, + // Supports dtypeA = dtypeB = Tf32 with dtypeD = [Fp32] + // Corresponds to the kind::tf32 of tcgen05.mma. 
+ Tf32 = 6 +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline bool mmaKindIsBlockFmt(MmaKind mmaKind) +{ + return mmaKind == MmaKind::MxFp8Fp6Fp4 || mmaKind == MmaKind::MxFp4NvFp4; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// For logging and error reporting +inline std::string mmaKindToString(MmaKind mmaKind) +{ + switch (mmaKind) + { + case MmaKind::Auto: return "Auto"; + case MmaKind::Fp16: return "Fp16"; + case MmaKind::Fp8Fp6Fp4: return "Fp8Fp6Fp4"; + case MmaKind::Int8: return "Int8"; + case MmaKind::MxFp4NvFp4: return "MxFp4NvFp4"; + case MmaKind::MxFp8Fp6Fp4: return "MxFp8Fp6Fp4"; + case MmaKind::Tf32: return "Tf32"; + default: assert(false); return "Unsupported type"; + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace gen +} // namespace trtllm + +} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/SfLayoutDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/SfLayoutDecl.h index f86c383259..9dca3cce24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/SfLayoutDecl.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/trtllmGen_gatedAct_export/trtllm/gen/SfLayoutDecl.h @@ -26,6 +26,9 @@ // //////////////////////////////////////////////////////////////////////////////////////////////////// +namespace gemmGatedAct +{ + namespace trtllm { namespace gen @@ -89,3 +92,5 @@ inline std::string sfLayoutToString(SfLayout layout) } // namespace gen } // namespace trtllm + +} // namespace gemmGatedAct diff --git a/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp b/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp index b64f9b7111..32631b1f5e 100644 --- 
a/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp +++ b/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp @@ -205,6 +205,7 @@ extern torch::Tensor fp8_block_scaling_gemm(torch::Tensor const& mat1, torch::Te auto const sm = tensorrt_llm::common::getSMVersion(); switch (sm) { + case 103: return fp8_block_scale_gemm_blackwell(mat1, mat2, mat1Scale, mat2Scale); case 100: return fp8_block_scale_gemm_blackwell(mat1, mat2, mat1Scale, mat2Scale); case 90: return fp8_block_scaling_gemm_hopper(mat1, mat2, mat1Scale, mat2Scale); case 89: return fp8_block_scaling_gemm_ada(mat1, mat2, mat1Scale, mat2Scale); diff --git a/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp b/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp index f0fd52868f..169970a037 100644 --- a/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp +++ b/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp @@ -53,11 +53,11 @@ void runGemm(at::Tensor& out, at::Tensor const& mat1, at::Tensor const& mat2, at stream.stream(), mat1.get_device()); } -template +template void runGemmGatedAct(at::Tensor& out, at::Tensor const& mat1, at::Tensor const& mat2, at::Tensor const& globalScale, at::Tensor const& globalScaleGate, int64_t m, int64_t n, int64_t k, bool lowLatencyKernel) { - auto eltType = trtllm::gen::Dtype::E4m3; + auto eltType = gemmGatedAct::trtllm::gen::Dtype::E4m3; tensorrt_llm::kernels::TrtllmGenGemmGatedActRunnerOptions options = {.eltType = eltType, .outputType = outDtype, .deepSeekFp8 = false, .transposeMmaOutput = lowLatencyKernel}; @@ -117,15 +117,15 @@ torch::Tensor fp8_per_tensor_scaling_tllmg_gemm_impl(torch::Tensor const& mat1, switch (outDtype.value()) { case at::ScalarType::Half: - runGemmGatedAct( + runGemmGatedAct( out, mat1, mat2, globalScale, globalScaleGate.value(), m, n, k, lowLatencyKernel); break; case at::ScalarType::BFloat16: - runGemmGatedAct( + runGemmGatedAct( out, mat1, mat2, globalScale, globalScaleGate.value(), m, n, k, lowLatencyKernel); break; case 
at::ScalarType::Float8_e4m3fn: - runGemmGatedAct( + runGemmGatedAct( out, mat1, mat2, globalScale, globalScaleGate.value(), m, n, k, lowLatencyKernel); break; default: C10_THROW_ERROR(NotImplementedError, "outDtype must be one of fp16/bf16/e4m3.");